DHH20080618-03

InfoInfo
Search:    
# cross-validate-tfidf-learner.py
# old bailey
#
# test performance of TF/IDF based learner on a
# cross-validation sample

import os, string, re, sys
from bayesian import *

# feature extractor handed to the classifier in place of a routine
# that would extract features itself: the document is expected to be
# a string made by concatenating the terms with highest TF/IDF,
# separated by single spaces
def passtfidf(doc):
    """Return a dict mapping every space-separated term of doc to 1."""
    return dict.fromkeys(doc.split(' '), 1)

# set learner type and number of features
learner = 'tfidf'
numfeatures = 40

# read the 10 sample files into an array of lists of the form
# samplelist[sample_number][item_number]
sampledir = 'Samples_1830s'
samplelist = []
for i in range(0, 10):
    # os.path.join supplies the platform's separator instead of a
    # hard-coded backslash
    f = open(os.path.join(sampledir, 'sample' + str(i) + '.txt'), 'r')
    try:
        # one item per line, trailing whitespace/newline removed
        sample = [x.rstrip() for x in f.readlines()]
    finally:
        # actually call close(); a bare `f.close` only names the method
        # and leaves the file open
        f.close()
    samplelist.append(sample)

# offence being tested: the file lists one item per line; an item found
# in this list is treated as a positive example of the offence
offencedir = 'Offences_1830s'
offencefile = 'theft-simplelarceny.txt'
# os.path.join supplies the platform's separator instead of a
# hard-coded backslash
f = open(os.path.join(offencedir, offencefile), 'r')
try:
    offencelist = [x.rstrip() for x in f.readlines()]
finally:
    # release the handle even if the read fails
    f.close()
offencecount = len(offencelist)

# directory of the per-trial files (named tfidf_<item>) read during
# training and testing
trialdir = 'TFIDF_1830s'

# create the results file and emit its header: title, offence under
# test, learner settings, then the column headings for the per-run rows
outfile = 'cross-val-tfidf-lrn.txt'
f = open(outfile, 'w')
f.writelines([
    'OLD BAILEY Tenfold Cross-Validation Learning Run\n\n',
    'Offence: ' + offencedir + '\\' + offencefile + '\n',
    'Learning run: tfidf, ' + str(numfeatures) + ' features\n',
    "\nRun, %7s, %7s, %7s, %7s\n" % ('Hit', 'Miss', 'FalseP', 'CorrN'),
])

# run tests

def _read_top_terms(path, limit):
    """Return the leading terms of a TF/IDF file as one string.

    path  -- file to read; the text before the first comma on each line
             is taken as the term
    limit -- maximum number of terms to use

    At most len(lines) - 1 terms are taken, so the final line of the
    file is never used.  The terms are joined with single spaces and
    trailing whitespace is stripped.
    """
    ff = open(path, 'r')
    try:
        whole = ff.readlines()
    finally:
        # release the handle even if the read fails
        ff.close()
    # clamp at zero so an empty file (or a negative limit) yields no terms
    feat = max(0, min(len(whole) - 1, limit))
    terms = [line.split(',')[0] for line in whole[:feat]]
    return ' '.join(terms).rstrip()


# membership is tested once per trial in every run; a set makes each
# test O(1) instead of rescanning the whole list
offenceset = set(offencelist)
numsamples = len(samplelist)

cl = []
try:
    for i in range(numsamples):

        # train a new learner for each run
        cl.append(naivebayes(passtfidf))

        # response categories
        hits = 0
        misses = 0
        falseps = 0
        corrns = 0

        # set testing sample to i
        # (single-argument print(...) behaves the same on Python 2 and 3)
        print(str(i) + ' Loading samples...')
        testingsample = samplelist[i]

        # set training sample to the concatenation of the others,
        # keeping them in sample order
        trainingsample = []
        for j in range(numsamples):
            if j != i:
                trainingsample.extend(samplelist[j])

        # train the learner on training sample
        print(str(i) + ' Training')
        sys.stdout.flush()

        for r in trainingsample:
            trialstr = _read_top_terms(
                os.path.join(trialdir, 'tfidf_' + r), numfeatures)

            # label is 'y' when the item is listed for the offence
            if r in offenceset:
                cl[i].train(trialstr, 'y')
            else:
                cl[i].train(trialstr, 'n')

        # test the learner on testing sample
        print(str(i) + ' Testing')
        sys.stdout.flush()

        for t in testingsample:

            # read trial into string
            trialstr = _read_top_terms(
                os.path.join(trialdir, 'tfidf_' + t), numfeatures)

            # use learner to categorize trial
            g = cl[i].classify(trialstr, default='n')

            # compare categorization with actual category
            if t in offenceset:
                # hit or miss
                if g == 'y':
                    hits += 1
                else:
                    misses += 1
            else:
                # false positive or correct negative
                if g == 'y':
                    falseps += 1
                else:
                    corrns += 1

        # write out learner performance; flush so finished runs are on
        # disk while later runs are still in progress
        f.write("%03d, %07d, %07d, %07d, %07d\n"
                % (i, hits, misses, falseps, corrns))
        f.flush()

    print('Done')
    sys.stdout.flush()
finally:
    # close output file, also when a run fails part-way through
    f.close()
This is a Wiki Spot wiki. Wiki Spot is a non-profit organization that helps communities collaborate via wikis.