SpLitter
?
# Describe SpLitter
#
# File: HW2.py
# Written by: Deepak Kumar
# Date: September 19, 2005
#
"""Print sentence and word statistics for the text stored in body.txt.

The text is cut into sentences at '.', '!', '?' and ':', stripped of the
remaining punctuation, and split into words on single spaces.  The script
then reports the number of sentences, the number of words, the (truncated)
average sentence length and the number of distinct words.
"""

import re


def split_sentences(text):
    """Return the non-empty sentences of *text* with punctuation removed.

    Args:
        text: the raw text to analyse.

    Returns:
        A list of sentence strings.  Fragments that contain nothing but
        spaces (for example the one after the final full stop) are dropped,
        so they are not counted as sentences.
    """
    # Line breaks do not matter (more or less): fold each run into one space.
    flat = re.sub('(\n)+', ' ', text)
    # Sentence boundaries occur at '.', '!', '?' and ':'; mark each with '\n'
    # so that there is one sentence per line.
    marked = re.sub('[.!:?]', '\n', flat)
    # Commas, dashes and apostrophes become a space, so "don't" yields the
    # two words "don" and "t".
    marked = re.sub(r"[,\-']", ' ', marked)
    # After the substitutions above only ';' can still match this class;
    # it is deleted outright rather than replaced by a space.
    marked = re.sub("[,';:]", '', marked)
    # Double quotes are deleted as well.
    marked = re.sub('"', '', marked)
    # Keep only fragments holding at least one non-space character, i.e. at
    # least one word under the splitting rule used by extract_words().
    return [s for s in marked.split('\n') if s.strip(' ')]


def extract_words(sentences):
    """Return every word of *sentences*, in order of appearance.

    Words are separated by single spaces only; the empty strings produced by
    consecutive spaces are discarded.
    """
    words = []
    for sentence in sentences:
        # extend() grows the list in place instead of copying it each time.
        words.extend(w for w in sentence.split(' ') if w)
    return words


def count_words(words):
    """Return a dict mapping each distinct word to its number of occurrences.

    The comparison is case-sensitive: 'The' and 'the' are different words.
    """
    counts = {}
    for word in words:
        counts[word] = counts.get(word, 0) + 1
    return counts


def main():
    """Read body.txt from the current directory and print its statistics."""
    print('Running program HW2.py...')

    # Open the text file and read its contents; 'with' closes the handle
    # even if reading fails.
    with open('body.txt') as f:
        content = f.read()

    sentences = split_sentences(content)
    nSentences = len(sentences)  # number of sentences in text
    print('The text has ' + str(nSentences) + ' sentences.')

    words = extract_words(sentences)
    nWords = len(words)  # total words in text
    print('The text has a total of ' + str(nWords) + ' words in it.')

    # Floor division keeps the whole-number average the script has always
    # printed; an empty text has no sentences, so report 0 instead of
    # dividing by zero.
    if nSentences:
        aveSentenceLength = nWords // nSentences
    else:
        aveSentenceLength = 0
    print('Average sentence length is ' + str(aveSentenceLength) +
          ' words/sentence.')

    # Find out unique words, and count occurrences while at it!
    wDict = count_words(words)
    # NOTE: the number printed here is the count of *distinct* words.
    print('There are a total of ' + str(len(wDict)) + ' words in this text')

    print('End of processing.')


if __name__ == '__main__':
    main()