DHH20080612-01

InfoInfo
Search:    
# tenfold-crossvalidation-sample.py
# old bailey
#
# given a list of trials, shuffle and divide
# into ten samples of approximately equal size

import os, random

# Written to run unchanged on Python 2.6+ and Python 3 (no print statement,
# no f-strings), since the original script predates Python 3.

# number of cross-validation samples to produce
NUM_SAMPLES = 10

# directory that receives sample0.txt .. sample9.txt
outdirname = 'Samples_1830s'

# input file: one trial id per line
trialfilename = 'trial-ids-1830s.txt'


def read_trials(path):
    """Return the trial ids in *path* as a list of newline-terminated lines.

    Blank lines are skipped so they are not counted as trials.  Every
    returned line ends in exactly one newline, even when the last line of
    the file has none; otherwise that id would be fused with whichever id
    follows it once the list has been shuffled and written back out.
    """
    with open(path, 'r') as f:
        return [line.rstrip('\r\n') + '\n' for line in f if line.strip()]


def split_into_samples(trials, numsamples=NUM_SAMPLES):
    """Divide *trials* into *numsamples* lists of approximately equal size.

    Each sample first gets len(trials) // numsamples consecutive items; the
    leftover items at the tail are then handed out one each to samples
    0, 1, 2, ... so that sample sizes differ by at most one.

    Returns a dict mapping sample index (0 .. numsamples-1) to its list.
    Raises ValueError if *numsamples* is less than 1.
    """
    if numsamples < 1:
        raise ValueError("numsamples must be at least 1")

    # floor division gives the basic sample size and the remainder
    samplesize, remainder = divmod(len(trials), numsamples)
    base = samplesize * numsamples

    # basic samples
    sample = {}
    for i in range(numsamples):
        index = i * samplesize
        sample[i] = list(trials[index:index + samplesize])

    # distribute remainder as equally as possible; remainder < numsamples,
    # so i never runs past the last sample
    for i, t in enumerate(trials[base:base + remainder]):
        sample[i].append(t)

    return sample


def write_samples(sample, dirname):
    """Write each sample to <dirname>/sample<k>.txt, one trial per line."""
    for k in sorted(sample):
        # os.path.join picks the right separator for the platform; a
        # hard-coded backslash only works on Windows
        outfilename = os.path.join(dirname, 'sample' + str(k) + '.txt')
        with open(outfilename, 'w') as f:
            f.writelines(sample[k])


def main():
    """Shuffle the trial list and write it out as ten samples."""
    # if output directory doesn't exist, create it
    if not os.path.isdir(outdirname):
        os.mkdir(outdirname)

    # get a list of trials
    triallist = read_trials(trialfilename)

    # shuffle it, changing list in place
    random.shuffle(triallist)

    numtrials = len(triallist)
    samplesize, remainder = divmod(numtrials, NUM_SAMPLES)
    print("Trials: %d; Base sample: %d; Remainder: %d" % (numtrials, samplesize, remainder))

    sample = split_into_samples(triallist, NUM_SAMPLES)

    # do sanity check: every trial must have landed in exactly one sample
    sanity = sum(len(trials) for trials in sample.values())
    if sanity != numtrials:
        # SystemExit with a message reports on stderr and exits non-zero;
        # quit() is an interactive helper that is not always defined
        raise SystemExit("Sanity check failed")

    # write samples to files
    write_samples(sample, outdirname)


if __name__ == '__main__':
    main()
This is a Wiki Spot wiki. Wiki Spot is a non-profit organization that helps communities collaborate via wikis.