Recent Changes for "DHH20060115-01" - Digital Historyhttp://digitalhistory.wikispot.org/DHH20060115-01Recent Changes of the page "DHH20060115-01" on Digital History.en-us DHH20060115-01http://digitalhistory.wikispot.org/DHH20060115-012008-09-05 19:02:52william.j.turkel <div id="content" class="wikipage content"> Differences for DHH20060115-01<p><strong></strong></p><table> <tr> <td> <span> Deletions are marked with - . </span> </td> <td> <span> Additions are marked with +. </span> </td> </tr> <tr> <td> Line 1: </td> <td> Line 1: </td> </tr> <tr> <td> </td> <td> <span>+ Go to the [http://digitalhistoryhacks.blogspot.com/2006/01/who-is-in-dictionary-of-canadian.html original post]<br> + Go back to the ["DHH Archive"]<br> + <br> + {{{<br> + # dcbo-ids-cloud.pl<br> + # 15 jan 2006<br> + #<br> + # wj turkel<br> + # http://digitalhistoryhacks.blogspot.com<br> + #<br> + # Goes to the online Dictionary of Canadian Biography to get the<br> + # number of people in each category ('Aboriginal', 'Accountant', etc.)<br> + # Outputs a tag cloud as HTML.<br> + #<br> + # LWP code adapted from<br> + # Burke, Perl &amp; LWP (O'Reilly 2002), pp. 27-28, 96-97.<br> + # Tag cloud adapted from<br> + # Bausch, Yahoo! Hacks (O'Reilly 2006), pp. 203-04.<br> + # Max subroutine from<br> + # Schwartz &amp; Phoenix, Learning Perl, 3rd ed (O'Reilly 2001), pp. 65.<br> + <br> + use LWP;<br> + use LWP::Simple;<br> + use POSIX "floor";<br> + <br> + sub max {<br> + my($max_so_far) = shift @_;<br> + foreach(@_) {<br> + if ($_ &gt; $max_so_far) {<br> + $max_so_far = $_;<br> + }<br> + }<br> + $max_so_far;<br> + }<br> + <br> + my $browser;<br> + sub do_POST {<br> + $browser = LWP::UserAgent-&gt;new() unless $browser;<br> + my $resp = $browser-&gt;post(@_);<br> + return ($resp-&gt;content, $resp-&gt;status_line, $resp-&gt;is_success, $resp)<br> + if wantarray;<br> + return unless $resp-&gt;is_success;<br> + return $resp-&gt;content;<br> + }<br> + <br> + my $doc_url = 'http://www.biographi.ca/EN/Search.asp';<br> + <br> + # Create a hash of category IDs and names by scraping base page.<br> + my %categories = ();<br> + my $document = get($doc_url);<br> + while ($document =~ m/&lt;A CLASS="NormalLink" HREF="Javascript:fSubmit\('','','','','([0-9]+)','1','','','','','',''\);"&gt;(.*?)&lt;\/A&gt;/g) {<br> + my ($id, $tmp) = ($1, $2);<br> + $tmp =~ s/&lt;IMG(.*?)&gt;//;<br> + $categories{$id} = $tmp;<br> + }<br> + <br> + # We need to keep the category keys in the right order.<br> + my @catarray = sort {$categories{$a} cmp $categories{$b}} (keys %categories);<br> + <br> + # For each different category, return count of matching biographies.<br> + my %categorycount = ();<br> + foreach my $id (@catarray) {<br> + my ($content, $message, $is_success) = do_POST(<br> + $doc_url,<br> + [ 'Data3' =&gt; $id,<br> + 'Data4' =&gt; '1' ],<br> + );<br> + die "Error $message\n"<br> + unless $is_success;<br> + $content =~ m{&lt;TD CLASS="STANTXT"&gt;&lt;B&gt;([0-9,]+)&lt;/B&gt; biography\(ies\) are available using your current search criteria}is;<br> + my $tmp = $1;<br> + $tmp =~ s/\,//;<br> + $categorycount{$id} = $tmp;<br> + # Be considerate to their server.<br> + sleep 2;<br> + }<br> + <br> + # Debugging scaffolding: check this output against tag cloud.<br> + print "\n----------------------------\n";<br> + foreach my $key (@catarray) {<br> + print "key: " . $key . "\tcat: " . $categories{$key} . "\tcnt: " . $categorycount{$key} . "\n";<br> + }<br> + print "\n----------------------------\n";<br> + <br> + # Now we send the tag cloud to an HTML file.<br> + open(OUTPUT, "&gt;ID-cloud.html");<br> + <br> + # Range of font sizes to use.<br> + my $minfontsize = 12;<br> + my $maxfontsize = 36;<br> + <br> + # Get the maximum number of biographies in any category.<br> + my $maxbio = &amp;max(values %categorycount);<br> + <br> + # Output the opening HTML tags.<br> + print OUTPUT "&lt;html&gt;\n&lt;head&gt;\n&lt;style&gt;\nbody {\n";<br> + print OUTPUT "\tbackground-color:#fff;\n\tfont-family:Tahoma, Verdana, Arial;\n";<br> + print OUTPUT "\tcolor:#354251;\n}\n.tag{\n\tmargin-bottom: 10px;\n\tpadding: 5px;\n}\n";<br> + print OUTPUT "&lt;/style&gt;\n&lt;/head&gt;\n&lt;body&gt;\n";<br> + print OUTPUT "&lt;table border=1px width=80% cellpadding=4px align=center&gt;\n&lt;tr&gt;\n&lt;td&gt;\n";<br> + <br> + # Print the name of each category in the appropriate sized font.<br> + foreach my $catid (@catarray) {<br> + my $fontsize = $minfontsize + floor(($maxfontsize-$minfontsize) * ($categorycount{$catid}/$maxbio));<br> + print OUTPUT "&lt;span class=\'tag\' style=\'font-size:" . $fontsize . "px\'&gt;" . $categories{$catid} . "&lt;/span&gt;\n";<br> + }<br> + <br> + # Output the closing HTML tags.<br> + print OUTPUT "&lt;/td&gt;&lt;/tr&gt;&lt;/table&gt;\n&lt;/body&gt;\n&lt;/html&gt;\n";<br> + <br> + print "Finished processing.\n";<br> + }}}</span> </td> </tr> </table> </div>