# online-learning.py
# old bailey
#
# given a labelled, chronological sequence of trials:
# predicts the category of the current item before using its label,
# then trains on that item with its label,
# and outputs running statistics
import os, string, re, sys
from bayesian import *
# the routine to extract features has to be bypassed
# this expects a string made by concatenating terms
# with highest TF/IDF
def passtfidf(doc):
    """Turn a space-separated string of terms into a feature dictionary.

    doc -- string of terms separated by single spaces

    Returns a dict that maps every distinct, non-empty term to 1.

    Empty tokens are dropped.  str.split(' ') produces them for an empty
    string and for leading, trailing or doubled spaces, and they would
    otherwise be counted as a feature named ''.
    """
    return dict((term, 1) for term in doc.split(' ') if term)
# read in a list of trials, one entry per line
# blank lines are dropped because they hold no trial id to sort on
f = open('trial-ids-1830s.txt', 'r')
try:
    triallist = [line for line in f.readlines() if line.strip()]
finally:
    # release the handle even when reading fails
    f.close()
# given a trial file name, return an integer
# that can be used for sorting
# the name must contain 8 digits (the date), a hyphen and a numeric id
_TRIALNAME_PATTERN = re.compile(r'(\d{8})-(\d+)', re.UNICODE)


def trialtoint(trialname):
    """Return an integer sort key built from the date and id in trialname.

    trialname -- string containing '<8 digits>-<digits>' somewhere in it;
                 the first such occurrence is used

    The key is the date followed by the id zero-padded to six digits, so
    keys order by date first and by id within a date (for ids of up to
    six digits).

    Raises ValueError when trialname contains no date-id pattern.
    """
    match = _TRIALNAME_PATTERN.search(trialname)
    if match is None:
        raise ValueError('no date-id pattern in trial name: %r' % (trialname,))
    date = match.group(1)
    trialid = match.group(2)
    # int() switches to arbitrary precision on its own when the value is
    # too large for a machine word, so no explicit long() is required
    return int("%8d%06d" % (int(date), int(trialid)))
# sort trial list into chronological order
# a key function calls trialtoint once per trial rather than twice per
# comparison; the sort is stable, so the resulting order is unchanged
triallist.sort(key=trialtoint)
# overall sample size
samplesize = len(triallist)
# directory that receives the result files
# it is created when it does not exist yet
outdir = 'Online_Runs_1830s'
if not os.path.exists(outdir):
    os.mkdir(outdir)
# test all offence categories that occur 10 or more times in the data set
# each line of the count file is read as '<offence file name>|<count>'
offencecountfile = 'offence-counts-1830s.txt'
f = open(offencecountfile, 'r')
try:
    offencecountlist = f.readlines()
finally:
    # release the handle even when reading fails
    f.close()
toprocess = []
for countline in offencecountlist:
    # a blank line has no count field to parse, so skip it
    if not countline.strip():
        continue
    linein = countline.split('|')
    if int(linein[1].rstrip()) > 9:
        toprocess.append(linein[0])
# run the learner on each offence
#
# for every offence category a fresh classifier is created; each trial,
# in the order of triallist, is first classified with the current state
# of the classifier and then used to train it with its actual label;
# the cumulative counts are written to one output file per offence

# settings shared by all runs
offencedir = 'Offences_1830s'
learner = 'tfidf'
numfeatures = 50
# output style is 'full' or 'fast'
outputstyle = 'full'
# for 'fast' output style: a row is written when the zero-based trial
# index is a multiple of this value
reportincrement = 40

for offencefile in toprocess:
    # one write that is meant to reproduce the console line of the former
    # pair of print statements, including the doubled space
    sys.stdout.write("Processing  " + offencefile + "\n")
    sys.stdout.flush()

    # read in a list of trials that belong in the offence category
    f = open(os.path.join(offencedir, offencefile), 'r')
    try:
        offencelist = [x.rstrip() for x in f.readlines()]
    finally:
        f.close()
    # number of instances of that offence in overall sample
    offencecount = len(offencelist)
    # a set keeps the membership test in the trial loop cheap
    offenceset = set(offencelist)

    # define a learner
    cl = naivebayes(passtfidf)

    # response categories
    guess = ''
    actualcat = ''
    hits = 0
    misses = 0
    falseps = 0
    corrns = 0

    # open output file and write the file header
    outfile = os.path.join(
        outdir, 'online-tfidf' + str(numfeatures) + '-' + offencefile)
    f = open(outfile, 'w')
    try:
        f.write('OLD BAILEY Online Learning Run\n\n')
        f.write('Offence: ' + offencedir + '\\' + offencefile + '\n')
        f.write('Learning run: tfidf, ' + str(numfeatures) + ' features\n')
        if outputstyle == 'full':
            f.write("\nTrial, Run, %6s, %3s, %5s, %6s, %7s, %6s\n"
                    % ('Guess', 'Act', 'Hit', 'Miss', 'FalseP', 'CorrN'))
        else:
            f.write("\nRun, %6s, %7s, %8s, %6s\n"
                    % ('Hit', 'Miss', 'FalseP', 'CorrN'))

        # test and train learner
        for i in range(samplesize):
            t = triallist[i].rstrip()

            # read the term file of the trial
            ff = open(os.path.join('TFIDF_1830s', 'tfidf_' + t), 'r')
            try:
                whole = ff.readlines()
            finally:
                ff.close()

            # take the first comma-separated field of the leading lines
            # NOTE(review): the last line of the file is never used as a
            # term (len(whole) - 1); presumably it is not a term line --
            # confirm against the TFIDF file format
            feat = min(len(whole) - 1, numfeatures)
            terms = [whole[k].split(',')[0] for k in range(feat)]
            # the same stripped string is used for classifying and for
            # training, so no trailing separator reaches the classifier
            trialstr = ' '.join(terms).rstrip()

            # use current state of learner to categorize trial
            guess = cl.classify(trialstr, default='n')

            # is this an instance of the offence category?
            if t in offenceset:
                # hit or miss
                actualcat = 'y'
                if guess == 'y':
                    hits += 1
                else:
                    misses += 1
            else:
                # false positive or correct negative
                actualcat = 'n'
                if guess == 'y':
                    falseps += 1
                else:
                    corrns += 1

            # train on the trial with its actual category
            cl.train(trialstr, actualcat)

            # write results to data table
            if outputstyle == 'full':
                f.write(str(trialtoint(t)) + ",\t")
                f.write("%06d, %3s, %3s, %06d, %06d, %06d, %06d\n"
                        % (i + 1, guess, actualcat,
                           hits, misses, falseps, corrns))
                f.flush()
            elif outputstyle == 'fast' and i % reportincrement == 0:
                f.write("%06d, %06d, %06d, %06d, %06d\n"
                        % (i + 1, hits, misses, falseps, corrns))
                f.flush()
    finally:
        # close the result file even when a trial fails part-way
        f.close()