#!/usr/bin/python
#
# (C) 2013.08.28 Pegasus Epsilon <pegasus@pimpninjas.org>
import sys
import re
from collections import Counter
# Because Josh requested that I play nice.
def word_counter_boring_version(filename):
    """Count the words in a file, spelled out one step at a time.

    Each line of *filename* is lowercased and split on whitespace, and
    the resulting words are tallied with collections.Counter.

    Returns a list of "word count" strings in Counter.most_common()
    order, i.e. the most frequent word first.
    """
    tally = Counter()
    with open(filename) as handle:
        for line in handle:
            # Lowercase before splitting so 'The' and 'the' share a bucket.
            tally.update(line.lower().split())
    # Glue every (word, count) pair into a single printable string.
    return ["%s %d" % (word, count) for word, count in tally.most_common()]
def word_counter(filename):
    """Return "word count" strings for *filename*, most common word first.

    Words are the whitespace-separated tokens of each line, lowercased
    before counting. The order of the result is whatever
    Counter.most_common() yields.
    """
    with open(filename) as source:
        # Stream the tokens straight into the Counter, line by line.
        tally = Counter(
            token
            for line in source
            for token in line.lower().split()
        )
    return [" ".join((token, str(hits))) for token, hits in tally.most_common()]
def print_words(filename):
    """Print every "word count" line for *filename* in ascending string order."""
    entries = word_counter(filename)
    # word_counter() hands back a fresh list, so sorting it in place is safe.
    entries.sort()
    for entry in entries:
        # One parenthesised argument prints the same under the print
        # statement and the print function.
        print(entry)
def print_top(filename, limit=20):
    """Print the most common words in *filename*, most frequent first.

    Args:
        filename: path of the text file to analyse.
        limit: maximum number of "word count" lines to print. Defaults
            to 20, the number the exercise asks for; None prints every
            entry. It is used directly as a slice bound, so pass a
            non-negative integer.
    """
    # word_counter() returns entries in most_common() order, so the
    # leading slice is the top of the ranking.
    for entry in word_counter(filename)[:limit]:
        # One parenthesised argument prints the same under the print
        # statement and the print function.
        print(entry)
'''
Wordcount exercise
Google's Python class
The main() below is already defined and complete. It calls print_words()
and print_top() functions which you write.
1. For the --count flag, implement a print_words(filename) function that
counts how often each word appears in the text and prints:
word1 count1
word2 count2
...
Print the above list in order sorted by word (python will sort punctuation
to come before letters -- that's fine). Store all the words as lowercase,
so 'The' and 'the' count as the same word.
2. For the --topcount flag, implement a print_top(filename) which is
similar to print_words() but which prints just the top 20 most common
words sorted so the most common word is first, then the next most common,
and so on.
Use str.split() (no arguments) to split on all whitespace.
Workflow: don't build the whole program at once. Get it to an intermediate
milestone and print your data structure and sys.exit(0).
When that's working, try for the next milestone.
Optional: define a helper function to avoid code duplication inside
print_words() and print_top().
'''
# This basic command line argument parsing code is provided and
# calls the print_words() and print_top() functions which you must define.
def main():
    """Parse the command line and dispatch to print_words() or print_top().

    Expects exactly two arguments after the program name: an option
    ('--count' or '--topcount') and a file name. On a wrong argument
    count or an unrecognised option, prints a message and exits with
    status 1.
    """
    args = sys.argv[1:]
    if len(args) != 2:
        print('usage: ./wordcount.py {--count | --topcount} file')
        sys.exit(1)

    option, filename = args
    if option == '--count':
        print_words(filename)
        return
    if option == '--topcount':
        print_top(filename)
        return

    # Anything else is not a flag this script understands.
    print('unknown option: ' + option)
    sys.exit(1)
# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()