#!/usr/local/bin/perl

# I-Spy
#   Cycles through a set of remote FTP and web sites to discover changes
#
#   Expected inputs (all optional):
#     -d <directory> a directory with site definitions
#                    a site definition is a directory which contains
#                    the data.txt file listing the target URL
#     -l <directory> a place to save the log
#     -w <directory> where to save the HTML report
#     -m <e-mail address> to whom to send the report
#     -h whether to e-mail an HTML report instead of a plain text one
#     -n <text> overrides my name; used for log names and such
#
#   06/28/2006 Version 2.2.0b2
#   Use and distribute this script as per the Artistic License
#   Copyright (C) 2006 Igor S. Livshits <mailto:i-spy@ayradyss.org>


# Define some global constants
#
my $true= (1==1);
my $false= (1==0);
my $noError= 0;
my $dataFileName= "data.txt";
my $addedFileName= "added.txt";
my $removedFileName= "removed.txt";
my $retainedFileName= "retained.txt";
my $snapshotFileName= "snapshot.txt";
my $oldSnapshotFileName= "old-snapshot.txt";
my $preFilterScriptName= "pre-filter";
my $postFilterScriptName= "post-filter";
my $webIndexFile= "index.html";
my $logFileToken= "<!-- Log file path -->";
my $previousReportToken= "<!-- Previous report -->";
my $tocToken= "<!-- Table of Contents -->";
my $directoryDelimiter= "/";
my $specialDelimiter= ":\t";
my $myPage= "http://www.ayradyss.org/programs/current.html#i-spy";
my $prettyNameRegEx= qr/^[\+_]+/;


# Define libraries and modules
#
push (@INC, 'pwd');             # add current directory to the search list
push (@INC, $ENV{HOME}."/bin"); # add special ~/bin directory to the search list
use strict;
use Getopt::Long;               # command line options processor
use Log::File;                  # a simple log file implementation
use Date::Format;               # pretty date output


# Define some global variables
#
my $dataDirectory= undef;
my $logDirectory= undef;
my $webDirectory= undef;
my $mailTo= undef;
my $html= undef;
my $myName= undef;
my $myAgentMask= undef;
my $logFile= undef;
my %sites= ();
my %skippedSites= ();

# Parse our command line options
#
GetOptions("d|dir|directory=s" => \$dataDirectory,
           "l|log|logDirectory=s" => \$logDirectory,
           "w|web=s" => \$webDirectory,
           "m|mail=s" => \$mailTo,
           "h|html" => \$html,
           "n|name=s" => \$myName,
           "a|agent|masquarade" => \$myAgentMask,
           );
Initialize();

# Iterate through our list of sites
#
Run();


# Terminate gracefully
#
Quit();


#
# Subroutines
#


# Initialize
#   Set up and sanity check all global variables
#
sub Initialize
{
  unless ($myName)
  {                             # grab my name from the process path
    $myName= $0;
    $myName= $1 if $myName=~ /([^$directoryDelimiter]+)$/;
    $myName=~ s/\.pl$//;        # drop the perl suffix if present
  }
  $myName= "i-spy" unless $myName;
                                # configure the data directory
  $dataDirectory= "sites" unless $dataDirectory;
  $dataDirectory.= $directoryDelimiter
    unless $dataDirectory=~ /$directoryDelimiter$/;

  if ($webDirectory)
  {                             # set some defaults for web reports
    $webDirectory.= $directoryDelimiter
      unless $webDirectory=~ /$directoryDelimiter$/;
    $logDirectory= $webDirectory."logs"
      unless $logDirectory;
  }
                                # configure logs
  $logDirectory= "logs" unless $logDirectory;
  $logFile= new Log::File(directory => $logDirectory,
                          name => "$myName.log"
                          );
}


# Run
#  Do our duty
#
sub Run
{
  GatherSites();
  CheckSites() if keys(%sites);
  CompareSnapshots() if keys(%sites);
  ReportNew();
}


# Quit
#  Terminate
#
sub Quit
{
  $logFile->Close();
  exit;
}


# LogEntry
#   Record a message in the log
#
sub LogEntry
{
  return unless $logFile;

  $logFile->MakeEntry(shift || "");
}


# MajorLogEntry
#   Record a message in the log separating it from previous messages
#
sub MajorLogEntry
{
  return unless $logFile;

  $logFile->SeparateLogEntries();
  $logFile->MakeEntry(shift);
  $logFile->MakeEntry();
}


# MinorLogEntry
#   Record a message in the log indenting it from a normal entry
#
sub MinorLogEntry
{
  return unless $logFile;

  $logFile->MakeEntry("\t", shift);
}


# BlankLogEntry
#   Record a blank line in the log
#
sub BlankLogEntry
{
  return unless $logFile;

  $logFile->MakeEntry();
}


# GatherSites
#   Collect all sites into an associative array
#
sub GatherSites
{
  my $site;                     # a given remote site
  my @leaves= ();               # all our data subdirectories and files
  my $dataFilePath;             # full relative data file name

  # Scan for directories within our data directory
  # each subdirectory will represent a site
  MajorLogEntry("Gathering sites within <$dataDirectory>...");
  if (-d $dataDirectory)
  {
    if (opendir(Sites, $dataDirectory))
    {                           # scan for directories
      @leaves= readdir(Sites);
      closedir(Sites);
    }
    else
    {
      LogEntry("Failed to open <$dataDirectory>: $!");
    }
  }
  else
  {
    LogEntry("<$dataDirectory> is not a directory");
  }
    
  # Weed out inappropriate directories (hidden, disabled, or missing data)
  @leaves= grep(!/^[\.~]/, @leaves); # skip hidden

  if (@leaves)
  {
    foreach $site (@leaves)
    {
      if (!-d $dataDirectory.$site)
      {
        LogEntry("Skipping <$site>: not a directory");
        next;
      }

      $dataFilePath= "$dataDirectory$site$directoryDelimiter$dataFileName";
      if (-T $dataFilePath)
      {                         # seems alright
        if (open(URL, $dataFilePath))
        {                       # grab our URLs
          while (<URL>)
          {
            $sites{$site}.= $_;
          }
          close(URL);
          LogEntry("Grabbed <$site>");
        }
        else
        {
          LogEntry("Skipping <$site>: $!");
        }
      }
      else
      {
        LogEntry("Skipping <$site>: cannot access the data file "
                      . "<$dataFilePath>");
      }
    }
  }
  else
  {
    LogEntry("Seems that we have no inputs");
  }
}


# CheckSites
#   Peek at each of our gathered sites for changes
#
sub CheckSites
{
  my $site;                     # a given remote site
  my $url;                      # a URL
  my $directive;                #  and other info about a remote site
  my $siteDirectory;            # a given site's directory
  my $error;                    # an error condition (null for success)

  # Check each site via its designated URL
  MajorLogEntry("Checking sites...");
  foreach $site (keys %sites)
  {
    $siteDirectory= $dataDirectory.$site.$directoryDelimiter;
    ($url, $directive)= split(/\n/, $sites{$site});
    $url=~ s/\n$//g;            # kill trailing returns
    $url=~ s/^<//g;             # kill leading bracket
    $url=~ s/>$//g;             # kill trailing bracket
    $url=~ s/^URL://g;          # kill leading URL designation
    LogEntry("$site:");

    $url= $directive            # looks like a special URL
      if $directive=~ /^\w+:/;
    my ($protocol, $host, $login, $password, $path)= ParseURL($url);

    unless ($protocol and $host)
    {                           # malformed URL?
      MinorLogEntry("could not parse the URL:");
      MinorLogEntry("\tprotocol= $protocol");
      MinorLogEntry("\thost= $host");
      MinorLogEntry("\tlogin= $login");
      MinorLogEntry("\tpassword= *secret*") if $password;
      MinorLogEntry("\tpath= $path");
      return "Could not parse the URL";
    }

    if ($protocol eq "FTP")
    {                           # get a listing via FTP
      $error= CheckViaFTP($host, $login, $password, $path,
                          $siteDirectory.$snapshotFileName, $directive);
    }
    elsif ($protocol eq "HTTP")
    {                           # grab a page via HTTP
      $error= CheckViaHTTP($url, $siteDirectory.$snapshotFileName);
    }
    else
    {                           # unuspported protocol
      $error= "Unsupported protocol ($protocol)";
    }
    
    if ($error)
    {                           # clear the entry due to failure
      $skippedSites{$site}= $error; # remember the error condition
      delete($sites{$site});
      MinorLogEntry("skipping $site in future processing");
    }
    LogEntry();
  }
}


# CheckViaFTP
#   Either get a directory listing or grab a precomputed one
#   Return the error description (null for success)
#
sub CheckViaFTP
{
  use Net::FTP;                 # from libnet by Graham Barr
  my $site= shift;              # the host name of our site
  my $login= shift;             # optional login name
  my $password= shift;          # optional login password
  my $path= shift;              # optional path to a file or a directory
  my $targetFile= shift;        # where to store the listing
  my $directive= shift;         # any special directive (ls v. dir)
  my $ftp;                      # the FTP client object
  my $error= 0;                 # an error condition (null for success)

  $ftp= Net::FTP->new($site);
  if ($ftp)
  {
    MinorLogEntry("contacting <$site>");
    MinorLogEntry("with login name <$login>") if $login;
    MinorLogEntry("and password *secret*") if $login and $password;

    if ($ftp->login($login ? $login : undef, $password ? $password : undef))
    {                           # successfully logged in
      if (($path eq "") or ($path=~ /$directoryDelimiter$/))
      {                         # URL points to a directory
        my @listing;            # to contain our directory listing
        if ($ftp->cwd($path))
        {                       # inhibits full paths
          if ($directive eq "dir")
          {                     # obey the directive for a verbose listing
            MinorLogEntry("getting a verbose listing of <$path>");
            @listing= $ftp->dir();
          }
          else
          {                     # obtain a terse listing
            MinorLogEntry("getting a terse listing of <$path>");
            if ($ftp->cwd($path))
            {                   # inhibits full paths
              @listing= $ftp->ls();
            }
          }
        }
        else
        {                       # hmm, could not set working directory
          @listing= ();         # clear the listing (just in case)
        }
        if (@listing)
        {
          if (open(Snapshot, ">$targetFile"))
          {                     # save the listing
            print(Snapshot join("\n", @listing), "\n");
            close(Snapshot);
          }
          else
          {
            MinorLogEntry("failed to save the listing: $!");
            $error= "Failed to save the listing ($!)";
          }
        }
        else
        {
          $error= $ftp->message(); # preserve the returned error message
          $error=~ s/\s+$//s;   # kill trailing white spaces
          $error=~ s/\n/\t/sg;  # make the message a single line
          MinorLogEntry("failed to grab the listing: $error" );
          $error= "Failed to grab the listing ($error)";
        }
      }
      else
      {                         # URL points to a file
        MinorLogEntry("getting the file <$path>");
        my $suffix= "";
        if ($path=~ /(\.\w+)$/)
        {                       # check for a suffix
          $suffix= $1;
          $targetFile.= $suffix # preserve the original suffix if different
            unless $targetFile=~ /$suffix$/;
        }
        my $localFile= $ftp->get($path, $targetFile);
        if ($localFile ne $targetFile)
        {                       # failed to get the file
          $error= $ftp->message(); # preserve the returned error message
          $error=~ s/\s+$//s;   # kill trailing white spaces
          $error=~ s/\n/\t/sg;  # make the message a single line
          $error= 
          MinorLogEntry("failed to grab the file <$path>: $error");
          $error= "Failed to grab the file ($error)";
        }
        elsif (($suffix eq ".Z") or ($suffix eq ".gz"))
        {                       # decompress the fetched file with gunzip
          if (system("gunzip --force $localFile"))
          {
            MinorLogEntry("failed to decompress the file <$localFile>: $!");
            $error= "Failed to decompress the file ($!)";
          }
        }
      }
    }
    else
    {                           # failed to log in
      $error= $ftp->message();  # preserve the returned error message
      $error=~ s/\s+$//s;       # kill trailing white spaces
      $error=~ s/\n/\t/sg;      # make the message a single line
      MinorLogEntry("failed to log in: $error" );
      $error= "Failed to log in ($error)";
    }
    $ftp->quit();
  }
  else
  {                             # failed to open a connection
    MinorLogEntry("failed to contact <$site>: $@");
    $error= "Failed to contact <$site> ($@)";
  }

  return $error;
}


# CheckViaHTTP
#   Grab a remote web page
#
sub CheckViaHTTP
{
  use LWP::UserAgent;           # from libwww-perl by Gisle Aas
  use HTML::TokeParser;         # fom HTML-Parser by Gisle Aas
  my $url= shift;               # the target URL
  my $targetFile= shift;        # local storage for the remote web page
  my $agent=                    # create our user agent
    new LWP::UserAgent;
  
  MinorLogEntry("grabbing the web page <$url>");
  $agent->agent($myAgentMask) if $myAgentMask;
  my $response=                 # issue our request
    $agent->request(new HTTP::Request(GET => $url), $targetFile);

  if ($response->is_success())
  {
    # let filters reduce web pages
  }
  else
  {                             # error getting the page
    if (open(Error, ">$targetFile"))
    {                           # save the error instead of expected data
      print Error $response->error_as_HTML();
      close(Error);
      my $parser=               # create our parser
        new HTML::TokeParser($targetFile);
      if ($parser->get_tag("body"))
      {                         # grab the remote server error message
        my $error= $parser->get_trimmed_text("/body");
        MinorLogEntry("remote server reported an error <$url>: $error");
        return $error;
      }
      else
      {                         # blank remote server server
        MinorLogEntry("remote server reported an unspecified error <$url>");
        return "The remote server returned an unspecified error";
      }
    }
    else
    {                           # could not save the error
      MinorLogEntry("could not save the remote server error "
                    . "<$targetFile>: $!");
      return "Could not save the remote server error ($!)";
    }
  }

  return $noError;              # everything is fine if we got here
}


# ParseURL
#   Separate a URL into its components
#
sub ParseURL
{
  my $url= shift;
  $url=~ /^(\w+):\/*([^\/]+)\/(.*)$/; # chunk the URL components
  
  my $protocol= uc($1);         # preserve matches in real variables
  my $site= $2;
  my $path= $3;
  my $login= "";
  my $password= "";

  if ($site=~ /\@/)
  {                             # check for the optional login field
    ($login, $site)= split("\@", $site);
    ($login, $password)=        # check for the optional password field
      split(":", $login);
  }

  return($protocol, $site, $login, $password, $path);
}


# CompareSnapshots
#   Compare previous and current snapshots to find new and removed items
#
sub CompareSnapshots
{
  my $site;                     # a given remote site
  my $siteDirectory;            # a given site's directory
  my @old;                      # lines of our snapshot files
  my @new;                      # lines of our snapshot files
  my $newItem;                  # individual lines from our snapshots
  my $oldItem;                  # individual lines from our snapshots
  my $error= 0;                 # an error condition (null for success)

  # Compare previous and current snapshots for each site
  MajorLogEntry("Comparing sites...");
  foreach $site (keys %sites)
  {    
    my @added= ();              # processed lines of our snapshot files
    my @retained= ();           # the scope change should take care of clearing
    my @removed= ();            # these for each site iteration

    $siteDirectory= $dataDirectory.$site.$directoryDelimiter;
    LogEntry("$site:");

    if (-x $siteDirectory.$preFilterScriptName)
    {                           # filter the raw result
      MinorLogEntry("pre-filtering current snapshot");
      if ($error= FilterResults($siteDirectory.$preFilterScriptName,
                                $siteDirectory.$snapshotFileName))
      {                         # filter returned an error
        $skippedSites{$site}= $error; # remember the error condition
        delete($sites{$site});
        MinorLogEntry("skipping <$site>");
        next;
      }
    }

    if (open(New, $siteDirectory.$snapshotFileName))
    {
      @new= sort <New>;         # read and sort the entire snapshot
      close(New);
    }
    else
    {                           # could not read the new snapshot
      MinorLogEntry("could not read the new snapshot <"
                    . $siteDirectory.$snapshotFileName . ">: $!");
      MinorLogEntry("skipping to the next site...");
      # remember the error condition
      $skippedSites{$site}= "Could not read the new snapshot ($!)";
      delete($sites{$site});
      next;                     # skip this site
    }

    if (open(Old, $siteDirectory.$oldSnapshotFileName))
    {
      @old= sort <Old>;         # read and sort the entire snapshot
      close(Old);
    }
    else
    {                           # could not read the old snapshot
      @old= ();                 # just use a blank for comparison
      MinorLogEntry("could not read a previous snapshot <"
                    . $siteDirectory.$oldSnapshotFileName . ">: $!");
    }
    
    $newItem= shift(@new);      # pre-fetch initial entries
    $oldItem= shift(@old);
    while ($oldItem and $newItem)
    {                           # check each pair of items
      if ($newItem lt $oldItem)
      {                         # this item appears only in the new snapshot
        push(@added, $newItem);
        $newItem= shift(@new);
      }
      elsif ($oldItem eq $newItem)
      {                         # identical items
        push(@retained, $oldItem);
        $newItem= shift(@new);
        $oldItem= shift(@old);
      }
      else
      {                         # this item appears only in the old snapshot
        push(@removed, $oldItem);
        $oldItem= shift(@old);
      }
    }
    push(@removed, $oldItem)    # preserve the current old item, if any
      if $oldItem;
    push(@removed, @old)        # preserve any residual old items, if any
      if @old;
    push(@added, $newItem)      # preserve the current new item, if any
      if $newItem;
    push(@added, @new)          # preserve any residual new items, if any
      if @new;

    MinorLogEntry("saving results");
    if (open(Added, ">$siteDirectory$addedFileName"))
    {
      print Added @added;       # flush added lines to disk
      close(Added);
    }
    else
    {                           # could not write added lines to disk
      MinorLogEntry("could not save added lines to disk <"
                    . $siteDirectory.$addedFileName . ">: $!");
      MinorLogEntry("skipping to the next site...");
      # remember the error condition
      $skippedSites{$site}= "Could not save added lines to disk ($!)";
      delete($sites{$site});
      next;                     # skip this site
    }
    if (open(Retained, ">$siteDirectory$retainedFileName"))
    {
      print Retained @retained; # flush retained lines to disk
      close(Retained);
    }
    else
    {                           # could not write retained lines to disk
      MinorLogEntry("could not save retained lines to disk <"
                    . $siteDirectory.$retainedFileName . ">: $!");
    }
    if (open(Removed, ">$siteDirectory$removedFileName"))
    {
      print Removed @removed;   # flush removed lines to disk
      close(Removed);
    }
    else
    {                           # could not write removed lines to disk
      MinorLogEntry("could not save removed lines to disk <"
                    . $siteDirectory.$removedFileName . ">: $!");
    }

    if (-x $siteDirectory.$postFilterScriptName)
    {                           # filter the new items result
      MinorLogEntry("filtering new items");
      if ($error= FilterResults($siteDirectory.$postFilterScriptName,
                                $siteDirectory.$addedFileName))
      {                         # filter returned an error
        $skippedSites{$site}= $error; # remember the error condition
        delete($sites{$site});
        MinorLogEntry("skipping <$site>");
      }
    }
    LogEntry();
  }
}


# ReportNew
#   Report newly discovered items
#
sub ReportNew
{
  my $success;                  # status returned by the protocol handlers

  if ($success= ($mailTo or $webDirectory))
  {
    MajorLogEntry("Reporting results...");
    $success= ReportNewViaMail($mailTo) if $mailTo;
    $success&&= ReportNewViaWeb($webDirectory) if $webDirectory;
    PreserveSnapshots() if $success;
  }
  else
  {
    MajorLogEntry("No report requested");
  }
}


# ReportNewViaMail
#   Report new items via e-mail
#
sub ReportNewViaMail
{
  use Net::SMTP;                # from libnet by Graham Barr
  my $to= shift;                # to whom I'll send the report
  my $user;                     # the user name part ofthe e-mail address
  my $domain;                   # the domain part of the e-mail address
  my $message;                  # the body of our report
  my $mailer;                   # allocate our mailer
  my $from;                     # our From: address

  LogEntry("Sending the report to <$to>");

  ($user, $domain)= split("\@", $to);
  unless ($domain)
  {                             # if we cannot figure out the remote domain
    $domain= "localhost";       # use the local mailer
    $to= $user . "\@" . $domain;
  }

  unless ($mailer= new Net::SMTP($domain))
  {                             # confirm our mailer
    MinorLogEntry("could not create a mail connection ($@)" .
                  "\n\tmake sure that you properly configured libnet");
    return $false;
  }

  if ($from= $ENV{'USER'} || $ENV{'LOGNAME'} || getlogin || (getpwuid($>))[0])
  {                             # set up our return address
    $from.= "\@" . $mailer->domain() if $mailer->domain();
  }
  else
  {                             # cannot learn our return address
    MinorLogEntry("could not learn my From: address");
    return $false;
  }
  $message= "To: $to\n" .       # add headers to our message
    "From: $myName <$from>\n" .
    time2str("Date: %a, %b %e %T %Y %z\n", time()) .
    "Subject: $myName report\n";

  $message.=                    # add content headers to support HTML
    "Content-Type: text/html\n" if $html;
  $message.=                    # now, append our report
    "\n" . CompileNew($html);

  if ($mailer->mail($from) && $mailer->to($to))
  {                             # negotiate envelope and send the report
    $mailer->data();
    $mailer->datasend($message);
    $mailer->dataend();
    $mailer->quit();
  }
  else
  {                             # could not send the report
    my $error= $mailer->message();
    $error=~ s/\s+$//s;         # kill trailing white spaces
    $error=~ s/\n/ /sg;         # make the message a single line
    MinorLogEntry("could not send the report ($error)");
    return $false;
  }

  return $true;
}


# ReportNewViaWeb
#   Report new items by generating a web page
#
sub ReportNewViaWeb
{
  my $directory= shift;         # where I'll store the report
  my $indexFile=                # the index file for our web directory
    $directory.$webIndexFile;
  my $pageName=                 # create a semi-unique page name
    $myName . "." . Now($true) . ".html";

  LogEntry("Saving the report as <$directory$pageName>");
  my $page =                    # the body and TOC of our report
    CompileNew($true, $true);

  my $log= LogReference();      # insert references to the log
  $page=~ s/$logFileToken/$log/sg;

  if (-l $indexFile)
  {                             # it's a symbolic link; see where it points
    my $link= readlink($indexFile);
    if ($link=~ /$directoryDelimiter/)
    {                           # hmm, must not be ours
      MinorLogEntry("previous link invalid <$link>");
    }
    else
    {                           # update the link
      if (unlink($indexFile))
      {                         # delete the old, create a new one
        MinorLogEntry("could not create a new index link " . "<$indexFile>: $!")
          unless symlink($pageName, $indexFile);
        $link=                  # construct a proper reference
          PreviousReportReference($link);
        $page=~                 # insert it in the report
          s/$previousReportToken/$link/sg;
      }
      else
      {                 # could not delete the old link
        MinorLogEntry("could not remove the old index link " .
                      "<$indexFile>: $!");
      }
    }
  }
  elsif (-e $indexFile)
  {                             # current index is not a link
    MinorLogEntry("current index is not a symbolic link -- ignoring");
  }
  else
  {                             # no index there now, set one up
    MinorLogEntry("could not create a new index link ". "<$indexFile>: $!")
      unless symlink($pageName, $indexFile);
  }

  $pageName= $directory.$pageName;
  if (open(Output, ">$pageName"))
  {                             # store our page
    print Output $page;
    close(Output);
    MinorLogEntry("could not change access for <$pageName>: $!")
      unless chmod(0644, $pageName);
  }
  else
  {                             # could not read added lines from disk
    MinorLogEntry("could not save the report: $!");
    return $false;
  }

  return $true;
}


# CompileNew
#   Compile a report of new items
#
sub CompileNew
{
  my $html= shift;              # should I generate plain text or HTML?
  my $toc= shift;               # should I generate a table of contents?
  my $site;                     # a given remote site
  my $prettySiteName;           # a given site name stripped of leading order enforcers
  my $siteDirectory;            # a given site's directory
  my $report;                   # the compiled report
  my @new;                      # a list of new items for each site
  my @candidates;               # a list of candidates for a given category
  my $error= 0;                 # an error condition (null for success)
  my $url= "";                  # URL of a given site
  my ($timeTag, $dateTag)= Now();

  # Gather new lines
  if ($html)
  {                             # create an HTML header
    MinorLogEntry("will format this report as HTML");

    $report= "<html>\n<head>\n"
      . "\t<title>New Items</title>\n\n"
      . "\t<!-- Report generated at $timeTag on $dateTag  -->\n"
      . "\t<!-- by i-spy, a perl script written by Igor S. Livshits "
      . "<mailto:i-spy\@ayradyss.org> -->\n"
      . "\t<!-- <
      . "current.html#i-spy">http://www.ayradyss.org/programs/"
      . "current.html#i-spy> -->\n"
      . "<style type=\"text/css\">"
      . "<!--"
      . "body {"
      . "color: #000000;"
      . "background-color: #d0d0d0;"
      . "} /* default */"
      . "a {"
      . "color: inherit;"
      . "background-color: inherit;"
      . "font: inherit;"
      . "text-decoration: inherit;"
      . "}"
      . "a:hover {"
      . "text-decoration: underline;"
      . "}"
      . "a:link { color: blue }"
      . "a:visited { color: purple }"
      . "a:active { color: red }"
      . "-->"
      . "</style>"
      . "</head>\n\n"
      . "<body>\n";
  }
  else
  {                             # create a text header
    MinorLogEntry("will format this report as plain text");

    $report= "";                # nothing yet
  }

  #if (@candidates= sort { lc($a) cmp lc($b) } (keys(%sites)))
  if (@candidates= sort (keys(%sites)))
  {                             # looks like we have some new items
    LogEntry("\nListing new items for each site:");
    if ($html)
    {                           # list sites with new items as HTML 
      $report.= "<center><h3>New Items</h3></center>\n";
      if ($toc)
      {                         # prepare a table of contents for new items
        $toc= "$tocToken\n"; 
        $report.= $toc;
      }
    }
    else
    {                           # list sites with new items as plain text
      $report.= "New items\n---------\n";
    }

    foreach $site (@candidates)
    {
      $prettySiteName= $site;
      $prettySiteName=~ s/$prettyNameRegEx//;
      $siteDirectory= $dataDirectory.$site.$directoryDelimiter;
      MinorLogEntry($site);

      ($url)= split(/\n/, $sites{$site});
      $url=~ /^(\w+):/;         # grab the protocol part
      my $protocol= uc($1);
      if (open(New, $siteDirectory.$addedFileName))
      {
        @new= <New>;            # list all new items
        close(New);
        
        if (@new)
        {
          if ($html)
          {                     # new items HTML list start
            $report.= "<p><ul><h4><a name=\"$prettySiteName\">" .
              "<a href=\"$url\">$prettySiteName</a></a></h4>\n";
            $toc.= "\t<li><a href=\"\#$prettySiteName\">$prettySiteName</a>\n"
              if $toc;          # add a table of contents entry
          }
          else
          {                     # new items text list start
            $report.= "\n$prettySiteName:\n\n";
          }
          foreach (@new)
          {
            s/\s+$//;           # kill trailing white spaces
            my ($link, $address)= split(/$specialDelimiter/);
            $address= $link     # the link text is also the path
              unless $address;
            if ($address=~ /^\//)
            {                   # absolute path
              $url=~ /^(\w+:\/*[^\/]+)/; # grab the URL protocol and host
              $address=         # construct a complete URL
                $1 . $address;
            }
            elsif ($address=~ /^\w+:/)
            {                   # a complete URL
              # do nothing
            }
            elsif (($address=~ /^\#/) and ($protocol eq "HTTP"))
            {                   # a name reference for HTTP URLs
              if ($url=~ /^(.+)\#[^\#]*$/)
              {                 # replace the name token
                $address= $1 . $address;
              }
              else
              {                 # add the name token
                $address= $url . $address;
              }
            }
            else
            {                   # relative path
              if ($protocol eq "HTTP")
              {                 # simplify the path
                my $urlRoot= $url;
                $urlRoot=~ s/\/[^\/]*$//; # drop the top leaf
                while ($address=~ s/^\.\.\///)
                {               # move up a level
                  $urlRoot=~ s/\/[^\/]*$//;
                }               # reconstitute absolute URL from parts
                $address= $urlRoot . "/" . $address;
              }
              else
              {                 # use as is to construct a complete URL
                $address= $url . $address;
              }
            }

            if ($html)
            {                   # new items HTML list item
              $report.= "\t<li><a href=\"$address\">$link</a>\n";
            }
            else
            {                   # new items text list item
              $report.= "$link\n\t<$address>\n";
            }
          }
          if ($html)
          {                     # new items HTML list end
            $report.= "</ul></p>\n<br>\n\n";
          }
          else
          {                     # new items text list end
            $report.= "\n";
          }
        }
        else
        {
          if ($html)
          {                     # no new items reported as HTML
            $report.= "<p><b>$myName</b> "
              . "did not discover any new items at "
              . "<a href=\"$url\">$prettySiteName</a>.</p>\n\n";
          }
          else
          {                     # no new items reported as plain text
            $report.= "\n$myName did not discover any new items at $prettySiteName.\n";
          }
        }
      }
      else
      {                         # could not read added lines from disk
        MinorLogEntry("could not read added lines from disk <"
                      . $siteDirectory.$addedFileName . ">: $!");
        MinorLogEntry("skipping to the next site...");
        # remember the error condition
        $skippedSites{$site}= "Could not read added lines from disk ($!)";
        delete($sites{$site});
        next;                   # skip this site
      }
    }
    $toc=                       # delimit a table of contents for new items
      "<ol>\n$toc</ol>\n<hr width=\"50%\">\n"
        if $toc and $toc=~ /<li>/; # if we have TOC items
  }
  else
  {                             # no new items to report, indicate this
    if ($html)
    {                           # list skipped sites as HTML
      $report.= "<center><h3>No New Items</h3></center>\n";
    }
    else
    {                           # list skipped sites as plain text
      $report.= "No new items\n";
    }
  }

  if (@candidates= keys(%skippedSites))
  {                             # looks like we have some new items
    LogEntry("\nListing skipped sites:");
    if ($html)
    {                           # list skipped sites as HTML
      $report.= "\n<hr>\n<center><h3>Errors</h3></center>\n";
      foreach $site (@candidates)
      {
        $prettySiteName= $site;
        $prettySiteName=~ s/$prettyNameRegEx//;
        MinorLogEntry("$site: " . $skippedSites{$site});
        $report.= "<p>Skipped <b>$prettySiteName</b>: "
          . $skippedSites{$site}
          . "</p>\n";
      }
    }
    else
    {                           # list skipped sites as plain text
      $report.= "\nErrors\n------\n";
      foreach $site (@candidates)
      {
        MinorLogEntry("$site: " . $skippedSites{$site});
        $report.= "Skipped $prettySiteName: $skippedSites{$site}\n";
      }
    }
  }

  if ($html)
  {                             # create an HTML footer
    $report.= "<hr><hr>\n$previousReportToken\n" .
      "&nbsp;$logFileToken\n" .
      "<br><p>Generated at $timeTag on $dateTag by " .
      "<a href=\"$myPage\">$myName</a></p>\n" .
      "</body>\n</html>\n";
  }
  else
  {                             # create a plain text footer
    $report.= "\nGenerated at $timeTag on $dateTag by $myName.\n";
  }

  $report=~ s/$tocToken/$toc/sg; # insert the table of contents
  return $report;
}


# FilterResults
#   Run a results file through an external filter program
#
sub FilterResults
{
  my $filterProg= shift;        # the name of the filter program
  my $sourceFile= shift;        # the path to the source file

  if (system($filterProg, $sourceFile, $dataDirectory))
  {                             # failed during filtering
    MinorLogEntry("filtering failed: $!");
    return "Filtering failed ($!)";
  }
  else
  {
    return $noError;
  }
}


# PreserveSnapshots
#   Preserve a copy of current snapshots for later processing
#
sub PreserveSnapshots
{
  my($site);                    # a given remote site
  my($siteDirectory);           # a given site's directory

  return unless keys(%sites);   # no sites left!

  # Preserve a copy of each new processed snapshot
  MajorLogEntry("Preserving site snapshots:");
  foreach $site (keys %sites)
  {
    $siteDirectory= $dataDirectory.$site.$directoryDelimiter;
    if (rename($siteDirectory.$snapshotFileName,
               $siteDirectory.$oldSnapshotFileName))
    {                           # report success
      MinorLogEntry($site);
    }
    else
    {                           # report failure
      LogEntry("Could not preserve $site snapshot: $!");
      $skippedSites{$site}= "Could not preserve snapshot ($!)";
    }
  }
}


# Now
#   Returns a time and a date tag generated from current time
#
sub Now
{
  my $terse= shift;             # # should I return a terse tag?
  my ($second, $minute, $hour, $day, $month, $year)=
    localtime(time());          # set up the time tag
  
  $month++;                     # correct for 0 based count
                                # and pad single digits
  $minute= "0".$minute if ($minute < 10);
  $second= "0".$second if ($second < 10);
  $month= "0".$month if ($month < 10);
  $day= "0".$day if ($day < 10);
  $year+= 1900;                 # add missing centuries
  
  return "$year$month$day$hour$minute$second"
    if $terse;                  # return a terse time stamp per request

  return                        # otherwise, a long version
    ("$hour:$minute:$second", "$month/$day/$year");
}


# PreviousReportReference
#   Inserts references to the previous report 
#
sub PreviousReportReference
{
  my $link= shift;              # our previous report reference

  $link= "[<a href=\"$link\">Previous report</a>]";

  return $link;
}


# LogReference
#   Inserts references to the current log 
#
sub LogReference
{
  my $log= $logFile->Path();    # reference to the current log file

  MinorLogEntry("could not change access for <$log>: $!")
      unless chmod(0644, $log);

  $log=~ s/^$webDirectory//;    # make it relative as necessary
  $log= "[<a href=\"$log\">Log</a>]";

  return $log;
}


#
# Terse help
#

__END__
=pod

=head1 NAME

i-spy -- scrape FTP and web sites for content changes

=head1 SYNOPSIS

B<i-spy> S<[B<-m> I<e-mail>]> S<[B<-d> I<directory>]>

B<i-spy> S<[B<-d> I<directory>]> S<[B<-w> I<directory>]>

(See the OPTIONS section for alternate option syntax with long option names.)

=head1 DESCRIPTION

B<i-spy> grabs and compares contents of FTP directories and web pages.
It then compiles a report and either sends it via e-mail or saves
it as a web page. You may also request both deliveries of the report.
For e-mail reports, you may request plain text or HTML.

B<i-spy> logs its activity as it chugs along. You may specify the log
directory, or B<i-spy> will try to find one automatically. For web page
reports, B<i-spy> will attempt to store the log in such a place where
it may be referenced by the report and served by the web server.

A site definition is a I<directory> which contains at least the F<data.txt>
file. This file must have the target URL as its first line. B<i-spy> 
currently deals with FTP and HTTP URLs. An optional second line may
indicate an alternate data source such as a precompiled site index
or listing (see the CPAN example). For FTP sites only, a second line may
also indicatre a directive for verbose listings (see the NTP
example). The latter may be useful for sites where the operator
updates directories and links without changing their name.

A site definition may also contain a F<pre-filter> (see the Apple example)
and a F<post-filter> (see the NTP example). B<i-spy> invokes pre-filters
before comparing snapshots, and post-filters after comparing snapshots.

B<i-spy> generates all other files you may find within site directories.

Options

=over 4

=item B<-d> I<directory>, B<--dir> I<directory>, B<--directory> I<directory>

A I<directory> contaning site definitions. If omitted, B<i-spy>
will try the present working directory.

=item B<-l> I<directory>, B<--log> I<directory>, B<--logDirectory> I<directory>

A I<directory> for the log. If omitted or improper, B<i-spy>
will try to figure out an appropriate place.

=item B<-w> I<directory>, B<--web> I<directory>

A I<directory> for  generated HTML reports; this directory should
ideally be accessible by a local or a remote web browser.

=item B<-m> I<e-mail>, B<--mail> I<e-mail>

A destination I<e-mail> address for generated reports.

=item B<-h>, B<--html>

A flag indicated whether to send an HTML report (if set) or a plain
text report (if not set). Reports saved in a specified web directory
will be formatted as HTML regardless of this setting.

=item B<-n> I<text>, B<--name> I<text>

Specified text string overrides the name learned from the OS;
used for log names and such.

=back

=head1 EXAMPLES

For periodic automated runs, try something like this:

C<i-spy --web /var/www/i-spy --directory /var/sites>

The above may be set to run once a day via B<cron> and will generate
a report each time which one may later check from a client machine
with a web browser. All web reports provide a link to a report from
a previous run. Also, each report references its corresponding log file.

B<i-spy> will expect to find the B<site directories> within the
C</var/sites> directory specified above via the C<--directory>
directive.

B<i-spy> will save the report in the directory specified by the
C<--web /var/www/i-spy> directive and the log in the corresponding
C</var/www/i-spy/logs> directory. Both should be accessible via
a web server or a locally running web browser.


For interactive reports delivered via e-mail, consider the following:

C<i-spy --mail user@domain.tld>

B<i-spy> will check the C<sites> directory in your present working
directory for B<site directories>. It will generate a plain text report
and send it to the specified C<user@domain.tld> e-mail address.

B<i-spy> will also leave a log in the C<logs> subdirectory of the present
working directory.

For HTML-formatted reports over e-mail, try:

C<i-spy --mail user@domain.tld --html>


=head1 FILES

=over 4

=item data.txt

A text file within individual site directories containing the primary URL
on the first line and either an alternate URL on line two or an FTP listing
type directive on line two.

=item added.txt

A text file within individual site directories containing all items added
since the preceding run.

=item removed.txt

A text file within individual site directories containing all items removed
since the preceding run.

=item retained.txt

A text file within individual site directories containing all items which
have remained unchanged since the preceding run.

=item snapshot.txt

A text file within individual site directories containing the current
snapshot of the remote resource.

=item old-snapshot.txt

A text file within individual site directories containing the snapshot
of the remote resource saved during a preceding run.

=item pre-filter

An executable to process the scraped snapshot before comparison to the
F<old-snapshot>.

=item post-filter

An executable to process the scraped snapshot after comparison to the
F<old-snapshot>.

=item index.html

A symbolic link to the latest report saved in the B<web> directory.

=back

=head1 REQUIRES

Perl 5.8, Getopt::Long, Log::File, Date::Format

=head1 SEE ALSO

perl(1)

=head1 BUGS

Send bug reports, questions, and requests to i-spy@ayradyss.org.

=head1 AUTHOR

Igor S. Livshits <mailto:i-spy@ayradyss.org>

=head1 COPYRIGHT

Copyright (C) 2006 Igor S. Livshits

Use and distribute this tool as per the Artistic License

=cut