#!/usr/bin/perl -w
use strict;
use DBI;
####
#
# (cc) 2005 http://creativecommons.org/licenses/by-nc-sa/2.5/
#
# Create a "tag" list of subject keywords from Horizon with links to your HIP
#
# You'll need to tweak the first section of variables to suit:
#
# $odbcConnection --> the name of your ODBC database connection
# $odbcUsername --> your ODBC user name
# $odbcPassword --> your ODBC password
#
# $minimumBibs --> the minimum number of bibs to match in the SQL query
# $threshold --> the minimum number of total matches for inclusion in the HTML output
# $spacing --> a character string to insert after each tag
#
# $hipUrl --> the link to your HIP (ideally using the Subject Alphabetical index)
#
# $outputFile --> file to output the HTML to
#
#
#
# You'll also need a "config.txt" file to define the boundaries between each text size
#
# See the following web page for more info:
#
# http://www.daveyp.com/blog/index.php/archives/47/
#
# This script also makes use of a chunk of scripting from this page:
#
# http://www.daveyp.com/blog/index.php/archives/29/
#
####
my $odbcConnection = 'REPORTSMITH';
my $odbcUsername = 'roger';
my $odbcPassword = '123';
my $minimumBibs = 5;
my $threshold = 250;
my $spacing = ' ...';
my $hipUrl = qq(http://webcat.hud.ac.uk/ipac20/ipac.jsp?index=SUBJECT&term=);
my $outputFile = 'subjects.html';
my $dbh = DBI->connect( "dbi:ODBC:$odbcConnection", $odbcUsername, $odbcPassword, { RaiseError => 1 });
my $sql = qq( select n_bibs,processed,reconst from subject where n_bibs > $minimumBibs order by n_bibs );
my $sth = $dbh->prepare( $sql );
$sth->execute( );
my $prev = '';
my @row = ( );
my @size = ( );
my @check = ( );
my @colour = ( );
my %subjectCount = ( );
my %real = ( );
my %list = ( );
open( IN, "config.txt" ) || die "unable to open config.txt";
my $range = 0 ;
while( )
{
s/\t/ /g;
s/ */ /g;
s/[\r\n]//g;
my( $count, $size, $colour ) = split( / / );
$range++;
$size[$range] = $size;
$check[$range] = $count;
$colour[$range] = $colour;
}
close( IN );
while ( @row = $sth->fetchrow_array )
{
my $count = $row[0];
my $processed = $row[1];
my $reconst = $row[2];
my $subject = reconstructTitle( $processed, $reconst );
if( $subject =~ / *\(/ && $subject !~ / *\-\-? / )
{
my( $pre, $aft ) = split( / *\(/, $subject, 2 );
if( $subject ne $pre )
{
$processed = $pre;
}
$real{$processed} = $pre;
}
elsif( $subject =~ / *\-\- / )
{
my( $pre, $aft ) = split( / *\-\- /, $subject, 2 );
if( $subject ne $pre )
{
$processed = $pre;
}
$real{$processed} = $pre;
}
elsif( $subject =~ / *\- / )
{
my( $pre, $aft ) = split( / *\- /, $subject, 2 );
if( $subject ne $pre )
{
$processed = $pre;
}
$real{$processed} = $pre;
}
elsif( $subject =~ / *\(/ )
{
my( $pre, $aft ) = split( / *\(/, $subject, 2 );
if( $subject ne $pre )
{
$processed = $pre;
}
$real{$processed} = $pre;
}
else
{
$real{$processed} = $subject;
}
$subjectCount{$processed} += $count;
if( $list{$processed} ) { $list{$processed} .= " / ".$subject; }
else { $list{$processed} .= $subject; }
}
open( OUTPUT, ">$outputFile" );
print OUTPUT qq(HIP subject keywords\n);
foreach my $v ( sort keys %subjectCount )
{
if( $subjectCount{$v} < $threshold ) { next }
my $x = $subjectCount{$v} - $threshold;
my $css = 1;
foreach my $l ( 1 .. $range )
{
if( $x > $check[$l] ) { $css = $l }
}
$css = 's'.$css;
if( length( $list{$v} ) > 500 ) { $list{$v} = substr($list{$v}, 0, 500).'...' }
if( substr( lc($real{$v}), 0, 1 ) ne $prev )
{
$prev = substr( lc($real{$v}), 0, 1 );
# print "\n";
}
print OUTPUT qq($real{$v}$spacing\n);
}
print OUTPUT qq(\n);
close( OUTPUT );
sub reconstructTitle
{
my $title = shift;
my $recon = shift;
my $final = $title;
my $len = length( $recon );
my $pos = 0;
### IF RECONSTRUCT IS EMPTY, WE DON'T NEED TO DO ANYTHING...
if( $recon =~ /^\s*$/ )
{
return( $final )
}
### BEGIN CRAWLING THROUGH RECONSTRUCT...
while( $pos < $len )
{
$pos++;
my $char = substr( $recon, $pos, 1 );
my $before = substr( $recon, $pos-1, 1 );
my $after = substr( $recon, $pos+1, 1 );
if( $char eq 'M' )
{
my $after_string = substr( $recon, $pos + 2, ord( substr( $recon, $pos+1, 1 ) ) );
if( length( $after_string ) < ord( substr( $recon, $pos+1, 1 ) ) )
{
$after_string = $after_string . " "
}
$final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)-1, 255 );
$pos = $pos + ord( substr( $recon, $pos+1, 1 ) );
$pos = $pos + 2;
}
elsif( $char eq 'I' )
{
my $after_string = $after;
$final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)-1, 255 );
$pos = $pos + 2;
}
elsif( $char eq '2' )
{
my $after_string = substr( $recon, $pos+1, 1 );
$final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before)+1, 255 );
$pos = $pos + 2;
}
elsif( $char eq 'R' )
{
my $after_string = substr( $recon, $pos+1, 1 );
$final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before), 255 );
$pos = $pos + 2;
}
elsif( $char eq 'P' )
{
my $after_string = substr( $recon, $pos+1, 2 );
$final = substr( $final, 0, ord($before) -1 ) . $after_string . substr( $final, ord($before), 255 );
$pos = $pos + 3;
}
elsif( $char eq 'D' )
{
$final = substr( $final, 0, ord($before) -1 ) . substr( $final, ( ord($before)-1 + ord($after) ), 255 );
$pos = $pos + 2;
}
elsif( $char eq 'E' )
{
$final = substr( $final, 0, ord($before) -1 ) . substr( $final, ( ord($before) ), 255 );
$pos = $pos + 1;
}
####### NOT SURE IF THIS IS RIGHT, BUT IT WORKS!
elsif( $char eq 'C' ) {
my $string = substr( $recon, $pos + 1, 255 );
$final = substr( $final, 0, ord($before) -1 ) . $string;
$pos = $pos + 1000;
}
####### GIVE UP IF IT'S AN UNKNOWN COMMAND...
else
{
die "unknown $char at $pos\n"
}
}
### FINAL TIDY UP FOR DISPLAY ON WEB PAGE...
$final =~ s/(\w) \/$/$1./gi;
$final =~ s/ \/$//g;
$final =~ s/ \:/\:/g;
$final =~ tr/\000-\177//cd;
$final =~ s/\.$//g;
return($final);
}