# DHH20080609-03

# InfoInfo
# Search:
# offence-category.py
# old bailey
#
# given a directory of trial files each marked with XML
# extract a list mapping trial id to offence

import os, sys, re
from BeautifulSoup import BeautifulStoneSoup

# given a directory string, return a list of file names
def getFileNames(dirstr):
    """Return the names of the entries in directory `dirstr`, sorted.

    The directory is listed with os.listdir rather than by running the
    Windows `dir /B` command through a shell, so the function works on any
    platform, copes with directory names that contain spaces, and never
    passes `dirstr` to a command interpreter.

    Only bare entry names are returned (no directory prefix).  The list is
    sorted so that the processing order is reproducible from run to run.

    Raises OSError if `dirstr` does not exist or cannot be listed.
    """
    return sorted(os.listdir(dirstr))

# given an XML tag describing an offence, return as a
# standardized string
def standardizeOffenceTags(offstring):
    """Turn an offence tag string into a lower-case, hyphenated label.

    Angle brackets, double quotes and the literal text 'category=' are
    removed (in that order), every remaining space becomes a hyphen, and
    the result is lower-cased.
    """
    cleaned = offstring
    # order matters: quotes are stripped before 'category=' is looked for
    for fragment in ('<', '>', '"', 'category='):
        cleaned = cleaned.replace(fragment, '')
    return cleaned.replace(' ', '-').lower()

# get a list of trial files to process
indirname = 'Mined_1830s'
filelist = getFileNames(indirname)

# matches the first tag at the very start of a serialized node; used to
# scrape out the opening tag of the first child of each offence
offencepattern = re.compile(r'<.*?>', re.UNICODE)

resultsfile = open('offence-categories-1830s.txt', 'w')

# try/finally (rather than 'with') keeps this runnable on older Python 2
# releases while still closing the results file if a trial file fails
try:
    for fn in filelist:

        # one output line per file: the file name followed by one
        # standardized offence string per <offence> element
        outfields = [fn]

        # read XML file into string and parse it
        # (os.path.join instead of a hard-coded backslash separator)
        f = open(os.path.join(indirname, fn), 'r')
        try:
            fnxml = f.read()
        finally:
            f.close()
        fnsoup = BeautifulStoneSoup(fnxml)
        offencelist = fnsoup.findAll('offence')

        # extract offences
        for o in offencelist:
            # take the first child that is not whitespace-only text; one
            # trial had a blank space in front of the first node, and this
            # also covers newlines or several leading blanks
            offence = None
            for child in o.contents:
                if str(child).strip():
                    offence = child
                    break

            omatch = None
            if offence is not None:
                omatch = offencepattern.match(str(offence))
            if omatch is None:
                # empty <offence>, or its first real child does not start
                # with a tag: stop with a message that names the file
                # instead of failing on a None match object
                raise ValueError('no offence tag found in an <offence> '
                                 'element of ' + fn)
            outfields.append(standardizeOffenceTags(omatch.group()))

        outstr = ','.join(outfields)

        # write offence data to file; flushed per file so partial results
        # survive if a later file fails
        resultsfile.write(outstr+'\n')
        resultsfile.flush()

        # provide feedback for user
        # (single-argument print(...) prints the same text under Python 2)
        print(outstr)
        sys.stdout.flush()
finally:
    resultsfile.close()
# This is a Wiki Spot wiki. Wiki Spot is a non-profit organization that helps communities collaborate via wikis.