SpLitter?

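SpLitter is a small Python script (HW2.py) that reads body.txt, splits the text into sentences and words, and prints the number of sentences, the number of words, the average sentence length, and the number of distinct words.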

# File: HW2.py
# Written by: Deepak Kumar
# Date: September 19, 2005
#
import re

# Text statistics for 'body.txt': sentence count, word count, average
# sentence length and number of distinct words.
#
# The work is split into three small functions so each step can be used
# (and checked) on its own; the script part at the bottom only runs when
# this file is executed directly.


def splitSentences(text):
    """Return the sentences found in *text* as a list of cleaned strings.

    Line breaks are treated as ordinary spaces.  A sentence ends at '.',
    '!', '?' or ':'.  Inside a sentence, dashes become spaces (so that
    'well-known' yields two words) while commas, semi-colons and single
    or double quotes are removed outright (so that "don't" stays one
    word).  Pieces that contain no text at all -- such as those produced
    by '...', '?!' or the final full stop -- are not counted as sentences.
    """
    # Lines do not matter (more or less): fold each run of breaks to a space
    flat = re.sub(r'[\r\n]+', ' ', text)

    sentences = []
    for piece in re.split(r'[.!:?]', flat):
        piece = piece.replace('-', ' ')        # dashes separate words
        piece = re.sub(r'[,;\'"]', '', piece)  # other punctuation vanishes
        piece = piece.strip()
        if piece:                              # skip empty leftovers
            sentences.append(piece)
    return sentences


def extractWords(sentences):
    """Return all words of *sentences*, in order, as one flat list.

    Words are separated by any run of whitespace, so no empty strings
    are produced.
    """
    words = []
    for s in sentences:
        # extend() grows the list in place instead of copying it each time
        words.extend(s.split())
    return words


def countWords(words):
    """Return a dictionary mapping each distinct word to its occurrences.

    The comparison is case-sensitive: 'The' and 'the' are different words.
    """
    wDict = {}
    for w in words:
        wDict[w] = wDict.get(w, 0) + 1
    return wDict


if __name__ == '__main__':
    print('Running program HW2.py...')

    # open the text file and read its contents; 'with' closes it for us
    with open('body.txt') as f:
        content = f.read()

    # split the text into sentences
    sentences = splitSentences(content)
    nSentences = len(sentences)  # number of sentences in text
    print('The text has ' + str(nSentences) + ' sentences.')

    # extract words from the text
    words = extractWords(sentences)
    nWords = len(words)  # total words in text

    # average sentence length in words per sentence; float() avoids the
    # truncating integer division of Python 2, and an empty text is
    # reported as 0 instead of raising ZeroDivisionError
    if nSentences > 0:
        aveSentenceLength = float(nWords) / nSentences
    else:
        aveSentenceLength = 0.0

    print('The text has a total of ' + str(nWords) + ' words in it.')
    print('Average sentence length is ' + ('%.2f' % aveSentenceLength)
          + ' words/sentence.')

    # find out unique words, and count occurrences while at it!
    wDict = countWords(words)
    print('There are a total of ' + str(len(wDict))
          + ' unique words in this text')
    print('End of processing.')