DHH20070702-01

InfoInfo
Search:    

Go to the [WWW]original post
Go back to the DHH Archive

# ncd-probe.py
#
# Test with Frobisher (DCB '34352')
#
# wjt
# http://digitalhistoryhacks.blogspot.com
#
# 1 jul 2007

from yahoo.search.web import WebSearch
import bz2
import random

pathstring = 'C:\Documents and Settings\HP_Administrator\My Documents\digital-history-datasets\DCB-txt\DCB-v01-txt'
filex = pathstring + '\\' + '34352' + '.txt'
xbytes = open(filex, 'r').read()
cx = bz2.compress(xbytes)

def ncd_probe(xbytes, cx, ybytes):
    # ybytes = open(filey, 'r').read()
    xybytes = xbytes + ybytes
    cy = bz2.compress(ybytes)
    cxy = bz2.compress(xybytes)
    n = (len(cxy) - len(cy)) / float(len(cx))
    return n

def printSortedDict(adict):
    keys = adict.keys()
    keys.sort()
    for k in keys:
        print k
        print adict[k]['Title']
        print adict[k]['Url']
        print adict[k]['Summary']
        print " "

app_id = "NCD-Probe-Demo"
srch = WebSearch(app_id, language='en')
srch.query = "Frobisher"
srch.results = 50

dom = srch.get_results()
results = srch.parse_results(dom)

ranked = {}
for res in results:
    # strip out search word from summary
    summary = str(res['Summary'])
    stripped_summary = summary.replace('Frobisher', '')
    distance = ncd_probe(xbytes, cx, stripped_summary)
    dstr = 'NCD: ' + str(distance)
    ranked[dstr] = res

printSortedDict(ranked)
This is a Wiki Spot wiki. Wiki Spot is a non-profit organization that helps communities collaborate via wikis.