#!/usr/local/bin/./perl

##########################################################################
# Created March/April 1996, Michael D. Smith
# Part of WebGlimpse (GlimpseHTTP) research with Udi Manber
# Glimpse mailing list: glimpse@cs.arizona.edu
# WebGlimpse home page: http://glimpse.cs.arizona.edu/webglimpse
#
# Modified by Dachuan Zhang, May 22, 1996
#       Some bugs fixed.
##########################################################################
#
# Usage: <this script> [archivedir] [-r] [-k] [-q]
#
#   archivedir  directory of the archive (defaults to "."); it is always
#               taken from the FIRST argument
#   -r          remove search boxes instead of adding them
#   -k          when removing, keep the neighborhood (.nh.*) files
#   -q          quiet: do not report each neighborhood file being removed
#
# Unless -r is given, the search page ($SEARCHPAGE) is regenerated from
# its template.  Then, if the configuration asks for search boxes, every
# HTML file listed in the archive's .wg_madenh file gets the search box
# inserted (or, with -r, removed).  A box is delimited in the HTML by
# the comment markers <!--GH_SEARCH--> and <!--GH_END-->.
##########################################################################

use strict;
use warnings;
use File::Copy;

##########################################################################
## GLOBALS
##########################################################################
# These stay package globals ("our") rather than lexicals because the
# required config.pl lives outside this file and may refer to them.

our $archivedir = shift(@ARGV);

our @filelist = ();
our @THEBOX   = ();
our @THEPAGE  = ();

##########################################################################
## SETTINGS
##########################################################################

# to be changed
our $WEBGLIMPSE_HOME = "/usr/local/webglimpse";
our $CGIBIN          = "cgi-bin/webglimpse";
our $FULLSEARCH      = "webglimpse-fullsearch";

# static
# Suffixes of the files we are allowed to modify.  The dots are literal,
# so a name such as "foohtm" is no longer treated as an HTML file.
our $HTMLFILE_RE         = '\.(?:s?html|sht|htm)$';
our $WEBGLIMPSE_LIB      = "$WEBGLIMPSE_HOME/lib";
our $BACKUPEXT           = ".wgbak";
our $MADENH              = ".wg_madenh";
our $SEARCHBOX           = ".wgbox.html";
our $SEARCHPAGE_TEMPLATE = ".wgindex.html";
our $SEARCHPAGE          = "wgindex.html";
our $nh_pre              = ".nh.";

##########################################################################
## ENTRY POINT
##########################################################################

#---------------------------------
# make my libraries more important
unshift(@INC, "$WEBGLIMPSE_LIB");

# TestConfig() and ReadConfig() are not defined in this file; presumably
# config.pl provides them.
require "config.pl";

#---------------------------------
## Cleaner handling of options as per M. Ernst.  --GB 9/24/97
our $removing = 0;
our $keep_nh  = 0;
our $quiet    = 0;

while (defined(my $arg = shift(@ARGV))) {
    if ($arg eq "-r") {
        $removing = 1;
    } elsif ($arg eq "-k") {
        $keep_nh = 1;
    } elsif ($arg eq "-q") {
        $quiet = 1;
    } else {
        die "Bad argument $arg (remaining args @ARGV)";
    }
}
## End option handling change --GB 9/24/97

if (!defined($archivedir) || $archivedir eq "") {
    $archivedir = ".";    # make it current dir
}

# Remember where we started.  The trailing newline must be removed,
# otherwise the chdir() back at the end can never succeed.
our $startpwd = `pwd`;
chomp $startpwd;

# try to change the directory to the archive directory
if (!chdir($archivedir)) {
    print "Cannot change directory to $archivedir.  Quitting.\n";
    exit -3;
}

# get the 'real' path
our $archivepwd = `pwd`;
chomp $archivepwd;

# make sure it has a configuration file (not required when removing)
if ($removing == 0 && TestConfig($archivepwd) == 0) {
    print "Cannot find configuration file for archive.  Quitting.\n";
    exit -4;
}

#----------------------
$MADENH    = "$archivepwd/$MADENH";
$SEARCHBOX = "$archivepwd/$SEARCHBOX";

# get the settings from the configuration file
# ($title, $archiveurl, $traverse_type, $numhops,$nhhops,$addboxes) =
#       &ReadConfig($archivepwd);
our ($title, $archiveurl, $traverse_type, $explicit_only, $numhops,
     $nhhops, $local_limit, $remote_limit, $addboxes, $vhost,
     $usemaxmem, @urllist) = ReadConfig($archivepwd);

### HUH?  What does this code do?  Can we remove it?
### here just in case.
# if we're told to do nothing, do nothing.
# Removed next 4 lines by bgopal oct/11/96
#if($addboxes==1 && $removing==0){
#       print "Possible problem with addsearch. Please investigate (debug msg).\n";
#       exit(0);
#}

our $archivetitle = $title;

if ($removing == 0) {
    # generate the wgindex.html file from the .wgindex.html
    read_search_page($SEARCHPAGE_TEMPLATE);
    make_search_page($SEARCHPAGE);
}

# if we don't have to addboxes, just abort!
if ($addboxes == 0) {
    print "No search boxes used\n";
    exit(0);
}

# get the list of files from the .wg_madenh file
{
    open(my $list_fh, '<', $MADENH)
        or die "Cannot open $MADENH for reading.";
    @filelist = <$list_fh>;
    close($list_fh);
}

# read the search box (not needed when we are only removing)
if ($removing == 0) {
    read_search_box($SEARCHBOX);
}

# make the additions to (or removals from) the listed files
add_search_box(@filelist);

#----------------------
# change the dir back
chdir($startpwd);

##########################################################################
### PROCEDURES
##########################################################################

##########################################################################
# read_lines($file)
#   Returns every line of $file as a list.  Dies if the file cannot be
#   opened.
sub read_lines {
    my ($file) = @_;

    open(my $fh, '<', $file) or die "Cannot open $file for reading.\n";
    my @lines = <$fh>;
    close($fh);

    return @lines;
}

##########################################################################
# expand_template_line($line, $file)
#   Returns a copy of $line in which the placeholders $ARCHIVETITLE,
#   $ARCHIVEURL, $ARCHIVEPWD, $CGIBIN and $FILE are replaced by the
#   corresponding global settings and by $file respectively.
sub expand_template_line {
    my ($line, $file) = @_;

    $line =~ s/\$ARCHIVETITLE/$archivetitle/g;
    $line =~ s/\$ARCHIVEURL/$archiveurl/g;
    $line =~ s/\$ARCHIVEPWD/$archivepwd/g;
    $line =~ s/\$CGIBIN/$CGIBIN/g;
    $line =~ s/\$FILE/$file/g;

    return $line;
}

##########################################################################
# read_search_page($file)
#   Loads the search-page template $file into the global @THEPAGE.
sub read_search_page {
    my ($file) = @_;

    @THEPAGE = read_lines($file);
    return;
}

##########################################################################
# make_search_page($file)
#   Writes @THEPAGE to $file with the template placeholders expanded.
#   Dies if $file cannot be written.
sub make_search_page {
    my ($file) = @_;

    open(my $out, '>', $file) or die "Cannot open $file for writing.\n";
    foreach my $realline (@THEPAGE) {
        print {$out} expand_template_line($realline, $file);
    }
    # buffered write errors only show up at close time
    close($out) or die "Cannot finish writing $file: $!\n";
    return;
}

##########################################################################
# read_search_box($file)
#   Loads the search-box template $file into the global @THEBOX.
sub read_search_box {
    my ($file) = @_;

    @THEBOX = read_lines($file);
    return;
}

##########################################################################
# add_search_box(@filelist)
#   For every name in @filelist that looks like an HTML file, rewrites
#   the file in place so that it contains the search box (or, when the
#   global $removing is set, no longer contains it).
#
#   The file is first copied to "$file.bak", then the original is
#   truncated and rewritten from the backup.  Writing into the existing
#   file (instead of moving a new one over it) keeps its permissions.
#   Files that cannot be backed up or opened are skipped with a warning
#   and are left untouched.
sub add_search_box {
    my (@files) = @_;

    FILE:
    foreach my $file (@files) {
        chomp($file);    # remove \n if exists

        # I can ONLY modify .html files
        next FILE if ($file !~ /$HTMLFILE_RE/);

        # remember the timestamps so they can be restored afterwards
        my @statinfo = stat($file);

        # Changed 9/16/97 to keep existing permissions
        # First we make a backup, then write into the real file.
        my $bakfile = "$file.bak";
        if (!copy($file, $bakfile)) {
            # Without a backup we must not truncate the original.
            warn "Cannot back up $file to $bakfile: $!\n";
            next FILE;
        }

        # open the backup file for reading
        if (!open(INPUT, '<', $bakfile)) {
            warn "Cannot open file $bakfile: $!\n";
            unlink($bakfile);
            next FILE;
        }

        # A failed open for writing has not touched the original, so
        # dropping the backup is all the cleanup that is needed.
        if (!open(OUTPUT, '>', $file)) {
            warn "Cannot open file $file for writing: $!\n";
            close(INPUT);
            unlink($bakfile);
            next FILE;
        }

        my $did_box = 0;    # a box has been written into this file
        my $rid_box = 0;    # a box has been removed from this file

        while (defined(my $line = <INPUT>)) {
            # copy until we see either </body>, </html>, or
            # <!--GH_SEARCH-->
            if ($did_box == 0
                && $line =~ /(.*)(<\/body>|<\/html>|<\!--GH_SEARCH-->)(.*)/i)
            {
                my ($left, $mid, $right) = ($1, $2, $3);

                if ($mid =~ /^<\!--GH_SEARCH-->$/i) {
                    # An existing box: skip through <!--GH_END-->.  If
                    # the end marker never shows up, $right keeps the
                    # text that followed the start marker.
                    while (defined(my $old = <INPUT>)) {
                        $old =~ /(.*)(<\!--GH_END-->)(.*)/i || next;
                        $right = $3;
                        last;
                    }

                    ($left eq "") || print OUTPUT "$left\n";
                    if ($removing == 1 && $rid_box == 0) {
                        # do_box() only deletes the neighborhood file
                        # in removing mode.  Leave an empty marker pair
                        # behind so a later run finds the same spot.
                        do_box($file);
                        print OUTPUT "<!--GH_SEARCH-->\n";
                        print OUTPUT "<!--GH_END-->\n";
                        $rid_box = 1;
                    } elsif ($removing == 1) {
                        # further boxes are dropped without a trace
                    } else {
                        do_box($file);
                        $did_box = 1;
                    }
                    ($right eq "") || print OUTPUT "$right\n";
                } else {
                    # </body> or </html>: the box goes right before it
                    $right = "$mid$right";
                    ($left eq "") || print OUTPUT "$left\n";
                    if ($removing != 1 && $did_box == 0) {
                        do_box($file);
                        $did_box = 1;
                    }
                    ($right eq "") || print OUTPUT "$right\n";
                }
            } else {
                print OUTPUT $line;
            }
        }

        # Nothing matched (or we are removing and found no box): append
        # the box at the end, or just clean up the neighborhood file.
        if ($did_box == 0 && $rid_box == 0) {
            do_box($file);
        }

        # close the two files
        close(INPUT);
        if (!close(OUTPUT)) {
            # keep the backup: the rewritten file may be incomplete
            warn "Error writing $file: $!; original kept in $bakfile\n";
            next FILE;
        }

        # DO NOT CHANGE THE MODE!!  IT MIGHT ALREADY HAVE BEEN SET FOR A
        # REASON!!  --GB
        # We now just delete the backup file.  -GB 9/16/97
        unlink($bakfile);

        # set the access and modification times back to the original
        if (@statinfo) {
            utime($statinfo[8], $statinfo[9], $file);
        }
    }
    return;
}

##########################################################################
# do_box($file)
#   Adding mode: prints the search box for $file, wrapped in the
#   <!--GH_SEARCH--> / <!--GH_END--> markers, to the OUTPUT filehandle
#   that add_search_box() has opened.
#   Removing mode ($removing set): prints nothing; instead deletes the
#   neighborhood file of $file (same directory, name prefixed with
#   $nh_pre) unless $keep_nh is set.
sub do_box {
    my ($file) = @_;

    # if we're removing, try to delete the neighborhood, too
    if ($removing == 1) {
        # might as well kill the neighborhood if no one will refer to it!
        # prepend the .nh. to the last path component
        my $newfile = $file;
        $newfile =~ s/([^\/]+)$/$nh_pre$1/;
        if (!$quiet) {
            print "removing $newfile\n";
        }
        if ($keep_nh == 0) {
            unlink($newfile);
        }
        return;
    }

    # The markers let a later run find, replace or remove this box.
    print OUTPUT "<!--GH_SEARCH-->\n";

    # output the contents of THEBOX
    foreach my $realline (@THEBOX) {
        print OUTPUT expand_template_line($realline, $file);
    }

    print OUTPUT "<!--GH_END-->\n";
    return;
}