# Count word tokens (every occurrence) and word types (distinct words) in
# the 'austen-persuasion' text of NLTK's Gutenberg corpus, two different ways.
# Both approaches print one line in the same "Tokens: ...<TAB>Types: ..." format.
import nltk

# NOTE(review): recent NLTK releases list this file id as
# 'austen-persuasion.txt' -- confirm against the installed NLTK version.
words = nltk.corpus.gutenberg.words('austen-persuasion')

# One way to do it: let the built-ins do the counting.
# len() gives the number of tokens; a set keeps one copy of each distinct word.
print("Tokens: %s\tTypes: %s" % (len(words), len(set(words))))

# Another way: a single explicit pass over the corpus.
tokens = 0
types = set()  # distinct words seen so far
for word in words:
    tokens += 1
    types.add(word)  # adding an already-seen word leaves the set unchanged

# print(...) with a single parenthesised argument behaves the same on
# Python 2 and Python 3, unlike the bare print statement.
print("Tokens: %s\tTypes: %s" % (tokens, len(types)))