# DHH20080625-01
#
# InfoInfo
# Search:
# online-learning.py
# old bailey
#
# given labelled, chronological sequence of data
# predicts category for unlabelled current item
# then is trained on labelled item
# outputs statistics

import os, string, re, sys
from bayesian import *

# the routine to extract features has to be bypassed
# this expects a string made by concatenating terms
# with highest TF/IDF
def passtfidf(doc):
    """Turn a pre-built term string into a feature dictionary.

    doc is split on single space characters and every resulting piece
    (including empty strings produced by leading, trailing or doubled
    spaces) becomes a key mapped to 1.
    """
    return dict.fromkeys(doc.split(' '), 1)

# read in a list of trials
# Blank lines are dropped here: they cannot name a trial and would make
# trialtoint() fail when the list is sorted below.
f = open('trial-ids-1830s.txt', 'r')
try:
    triallist = [line for line in f.readlines() if line.strip()]
finally:
    # close the handle even if reading fails
    f.close()

# given a trial file name, return long integer
# that can be used for sorting
# Compiled once at import time; trialtoint() is called for every element
# when the trial list is sorted.
_TRIAL_ID_PATTERN = re.compile(r'(\d{8})-(\d+)', re.UNICODE)


def trialtoint(trialname):
    """Return an integer sort key for a trial file name.

    The first occurrence of eight digits, a hyphen and one or more digits
    in trialname is taken as date and id.  The key is the date
    followed by the id zero-padded to at least six digits, so keys order
    by date first and id second.

    Raises ValueError if trialname contains no such date-id pair.
    """
    match = _TRIAL_ID_PATTERN.search(trialname)
    if match is None:
        raise ValueError("no date-id pair in trial name: %r" % (trialname,))
    date = int(match.group(1))
    number = int(match.group(2))
    # int() promotes to an arbitrary-precision integer when needed, so the
    # Python-2-only long() is not required.
    return int("%8d%06d" % (date, number))

# sort trial list into chronological order
# key= computes each trial's sort key once instead of on every comparison;
# the resulting order is the same as with the cmp-style comparator.
triallist.sort(key=trialtoint)

# overall sample size
samplesize = len(triallist)

# output directory
outdir = 'Online_Runs_1830s'
if not os.path.exists(outdir):
    os.mkdir(outdir)

# test all offence categories that occur 10 or more times in the data set
# each line of the counts file is read as "<offence file name>|<count>"
offencecountfile = 'offence-counts-1830s.txt'
f = open(offencecountfile, 'r')
try:
    offencecountlist = f.readlines()
finally:
    # close the handle even if reading fails
    f.close()
toprocess = []
for k in offencecountlist:
    # skip blank lines (e.g. a trailing newline) rather than failing on them
    if not k.strip():
        continue
    linein = k.split('|')
    if int(linein[1].rstrip()) > 9:
        toprocess.append(linein[0])

# run the learner on each offence

def _read_top_terms(trialname, numfeatures):
    """Return the leading terms of one trial's TF/IDF file as a string.

    Reads TFIDF_1830s/tfidf_<trialname>, takes the first comma-separated
    field of each of the first numfeatures lines and returns them joined,
    each followed by a single space.  The last line of the file is never
    used (presumably it is not a term line -- TODO confirm).
    """
    ff = open(os.path.join('TFIDF_1830s', 'tfidf_' + trialname), 'r')
    try:
        whole = ff.readlines()
    finally:
        ff.close()
    # never take the final line; clamp at zero for an empty file
    feat = max(min(len(whole) - 1, numfeatures), 0)
    return ''.join([line.split(',')[0] + ' ' for line in whole[:feat]])


for offencefile in toprocess:

    # The doubled space matches what the original pair of print statements
    # produced ("Processing " followed by the soft space).
    sys.stdout.write("Processing  " + offencefile + "\n")
    sys.stdout.flush()

    # read in a list of trials that belong in the offence category
    offencedir = 'Offences_1830s'
    f = open(os.path.join(offencedir, offencefile), 'r')
    try:
        offencelist = [x.rstrip() for x in f.readlines()]
    finally:
        f.close()

    # number of instances of that offence in overall sample
    offencecount = len(offencelist)

    # set copy so the per-trial membership test below is constant time
    offenceset = set(offencelist)

    # define a learner
    learner = 'tfidf'
    numfeatures = 50
    cl = naivebayes(passtfidf)

    # response categories
    guess = ''
    actualcat = ''
    hits = 0
    misses = 0
    falseps = 0
    corrns = 0

    # output style is 'full' or 'fast'
    outputstyle = 'full'

    # for 'fast' output style
    reportincrement = 40

    # open output file and write the file header
    outfile = os.path.join(
        outdir, 'online-tfidf' + str(numfeatures) + '-' + offencefile)
    f = open(outfile, 'w')
    try:
        f.write('OLD BAILEY Online Learning Run\n\n')
        f.write('Offence: ' + offencedir + '\\' + offencefile + '\n')
        f.write('Learning run: tfidf, ' + str(numfeatures) + ' features\n')
        if outputstyle == 'full':
            f.write("\nTrial,          Run, %6s, %3s, %5s, %6s, %7s, %6s\n" % ('Guess', 'Act', 'Hit', 'Miss', 'FalseP', 'CorrN'))
        else:
            f.write("\nRun, %6s, %7s, %8s, %6s\n" % ('Hit', 'Miss', 'FalseP', 'CorrN'))

        # test and train learner, one trial at a time in chronological order
        for i, rawname in enumerate(triallist):
            t = rawname.rstrip()

            # read trial into string
            trialstr = _read_top_terms(t, numfeatures)

            # use current state of learner to categorize trial
            guess = cl.classify(trialstr.rstrip(), default='n')

            # is this an instance of the offence category?
            # NOTE(review): training receives the unstripped string, as in
            # the original; with passtfidf the trailing space adds an
            # empty-string feature that classification never sees.
            if t in offenceset:
                # hit or miss
                actualcat = 'y'
                if guess == 'y':
                    hits += 1
                else:
                    misses += 1
                # train a positive instance
                cl.train(trialstr, 'y')
            else:
                # false positive or correct negative
                actualcat = 'n'
                if guess == 'y':
                    falseps += 1
                else:
                    corrns += 1
                # train a negative instance
                cl.train(trialstr, 'n')

            # write results to data table
            if outputstyle == 'full':
                f.write("%s,\t%06d, %3s, %3s, %06d, %06d, %06d, %06d\n" % (trialtoint(t), i+1, guess, actualcat, hits, misses, falseps, corrns))
            elif (outputstyle == 'fast') and (i % reportincrement == 0):
                f.write("%06d, %06d, %06d, %06d, %06d\n" % (i+1, hits, misses, falseps, corrns))
            else:
                # nothing written for this trial, so nothing to flush
                continue
            f.flush()
    finally:
        # close the results file even if a trial fails part-way through
        f.close()
# This is a Wiki Spot wiki. Wiki Spot is a 501(c)3 non-profit organization that helps communities collaborate via wikis.