HIP subject keywords

#!/usr/bin/perl -w use strict; use DBI; #### # # (cc) 2005 http://creativecommons.org/licenses/by-nc-sa/2.5/ # # Create a "tag" list of subject keywords from Horizon with links to your HIP # # You'll need to tweak the first section of variables to suit: # # $odbcConnection --> the name of your ODBC database connection # $odbcUsername --> your ODBC user name # $odbcPassword --> your ODBC password # # $minimumBibs --> the minimum number of bibs to match in the SQL query # $threshold --> the minimum number of total matches for inclusion in the HTML output # $spacing --> a character string to insert after each tag # # $hipUrl --> the link to your HIP (ideally using the Subject Alphabetical index) # # $outputFile --> file to output the HTML to # # # # You'll also need a "config.txt" file to define the boundaries between each text size # # See the following web page for more info: # # http://www.daveyp.com/blog/index.php/archives/47/ # # This script also makes use of a chunk of scripting from this page: # # http://www.daveyp.com/blog/index.php/archives/29/ # #### my $odbcConnection = 'REPORTSMITH'; my $odbcUsername = 'roger'; my $odbcPassword = '123'; my $minimumBibs = 5; my $threshold = 250; my $spacing = ' ...'; my $hipUrl = qq(http://webcat.hud.ac.uk/ipac20/ipac.jsp?index=SUBJECT&term=); my $outputFile = 'subjects.html'; my $dbh = DBI->connect( "dbi:ODBC:$odbcConnection", $odbcUsername, $odbcPassword, { RaiseError => 1 }); my $sql = qq( select n_bibs,processed,reconst from subject where n_bibs > $minimumBibs order by n_bibs ); my $sth = $dbh->prepare( $sql ); $sth->execute( ); my $prev = ''; my @row = ( ); my @size = ( ); my @check = ( ); my @colour = ( ); my %subjectCount = ( ); my %real = ( ); my %list = ( ); open( IN, "config.txt" ) || die "unable to open config.txt"; my $range = 0 ; while( ) { s/\t/ /g; s/ */ /g; s/[\r\n]//g; my( $count, $size, $colour ) = split( / / ); $range++; $size[$range] = $size; $check[$range] = $count; $colour[$range] = $colour; } close( IN ); while ( @row = $sth->fetchrow_array ) { my $count = $row[0]; my $processed = $row[1]; my $reconst = $row[2]; my $subject = reconstructTitle( $processed, $reconst ); if( $subject =~ / *\(/ && $subject !~ / *\-\-? / ) { my( $pre, $aft ) = split( / *\(/, $subject, 2 ); if( $subject ne $pre ) { $processed = $pre; } $real{$processed} = $pre; } elsif( $subject =~ / *\-\- / ) { my( $pre, $aft ) = split( / *\-\- /, $subject, 2 ); if( $subject ne $pre ) { $processed = $pre; } $real{$processed} = $pre; } elsif( $subject =~ / *\- / ) { my( $pre, $aft ) = split( / *\- /, $subject, 2 ); if( $subject ne $pre ) { $processed = $pre; } $real{$processed} = $pre; } elsif( $subject =~ / *\(/ ) { my( $pre, $aft ) = split( / *\(/, $subject, 2 ); if( $subject ne $pre ) { $processed = $pre; } $real{$processed} = $pre; } else { $real{$processed} = $subject; } $subjectCount{$processed} += $count; if( $list{$processed} ) { $list{$processed} .= " / ".$subject; } else { $list{$processed} .= $subject; } } open( OUTPUT, ">$outputFile" ); print OUTPUT qq(HIP subject keywords\n); foreach my $v ( sort keys %subjectCount ) { if( $subjectCount{$v} < $threshold ) { next } my $x = $subjectCount{$v} - $threshold; my $css = 1; foreach my $l ( 1 .. $range ) { if( $x > $check[$l] ) { $css = $l } } $css = 's'.$css; if( length( $list{$v} ) > 500 ) { $list{$v} = substr($list{$v}, 0, 500).'...' } if( substr( lc($real{$v}), 0, 1 ) ne $prev ) { $prev = substr( lc($real{$v}), 0, 1 ); # print "

\n"; } print OUTPUT qq($real{$v}$spacing\n); } print OUTPUT qq(\n); close( OUTPUT ); sub reconstructTitle { my $title = shift; my $recon = shift; my $final = $title; my $len = length( $recon ); my $pos = 0; ### IF RECONSTRUCT IS EMPTY, WE DON'T NEED TO DO ANYTHING... if( $recon =~ /^\s*$/ ) { return( $final ) } ### BEGIN CRAWLING THROUGH RECONSTRUCT... while( $pos < $len ) { $pos++; my $char = substr( $recon, $pos, 1 ); my $before = substr( $recon, $pos-1, 1 ); my $after = substr( $recon, $pos+1, 1 ); if( $char eq 'M' ) { my $after_string = substr( $recon, $pos + 2, ord( substr( $recon, $pos+1, 1 ) ) ); if( length( $after_string ) < ord( substr( $recon, $pos+1, 1 ) ) ) { $after_string = $after_string . " " } $final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)-1, 255 ); $pos = $pos + ord( substr( $recon, $pos+1, 1 ) ); $pos = $pos + 2; } elsif( $char eq 'I' ) { my $after_string = $after; $final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)-1, 255 ); $pos = $pos + 2; } elsif( $char eq '2' ) { my $after_string = substr( $recon, $pos+1, 1 ); $final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)+1, 255 ); $pos = $pos + 2; } elsif( $char eq 'R' ) { my $after_string = substr( $recon, $pos+1, 1 ); $final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before), 255 ); $pos = $pos + 2; } elsif( $char eq 'P' ) { my $after_string = substr( $recon, $pos+1, 2 ); $final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before), 255 ); $pos = $pos + 3; } elsif( $char eq 'D' ) { $final = substr( $final, 0, ord($before) -1 ) . substr( $final, ( ord($before)-1 + ord($after) ), 255 ); $pos = $pos + 2; } elsif( $char eq 'E' ) { $final = substr( $final, 0, ord($before) -1 ) . substr( $final, ( ord($before) ), 255 ); $pos = $pos + 1; } ####### NOT SURE IF THIS IS RIGHT, BUT IT WORKS! elsif( $char eq 'C' ) { my $string = substr( $recon, $pos + 1, 255 ); $final = substr( $final, 0, ord($before) -1 ) . $string; $pos = $pos + 1000; } ####### GIVE UP IF IT'S AN UNKNOWN COMMAND... else { die "unknown $char at $pos\n" } } ### FINAL TIDY UP FOR DISPLAY ON WEB PAGE... $final =~ s/(\w) \/$/$1./gi; $final =~ s/ \/$//g; $final =~ s/ \:/\:/g; $final =~ tr/\000-\177//cd; $final =~ s/\.$//g; return($final); }