#!/usr/bin/perl
# This is a program that recursively generates Captrap's pages and graphs.
# Copyright 2009 Corey Hickey
# This file is part of Captrap.
#
# Captrap is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Captrap is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Captrap. If not, see <http://www.gnu.org/licenses/>.
=head1 NAME
captrap_recurse - a command-line program for recursively generating Captrap
pages and graphs
=head1 SYNOPSIS
captrap_recurse [OPTION] [OPTION-PARAMETERS]
=head1 DESCRIPTION
This program will recursively generate all of Captrap's pages. There is also a
"cron mode" suited for running periodically as a cron job.
=head1 OPTIONS
=over
=item -help
Print brief usage text.
=item -list-parameters
Print a list of valid parameters.
=item -write FILE PARAMETERS
Write a graph to a file, unless the file already exists. To write to stdout,
specify the file as '-' (if you really want to create a file named '-', specify
'./-'). The filename must be followed by a list of parameters in the usual URL
form (as in param1=value1&param2=value2). Be sure to surround the parameters
with single-quotes if necessary, to avoid having the shell interpret the
ampersands.
=item -v-write DIRECTORY PARAMETERS
Same as "-write", but verbose.
=item -validate DIRECTORY PARAMETERS
Same as "-v-write", but run the HTML pages through a markup validator. This
uses the same backend as the W3C validator, but doesn't have the nice error
parsing. This option is included mainly for development and testing. The
validation depends on the onsgmls program and the W3C xml.soc file. Debian
users should install the opensp and w3c-markup-validator packages; there's no
need to configure a webserver to run the validator. If onsgmls and/or xml.soc
are in locations other than where captrap_recurse expects them, set the
ONSGMLS_PATH and/or CATALOG_PATH environment variables to the corresponding
paths.
=item -cron-write BASE LINK ROUNDOFF
This is a wrapper for the "-write" action intended to be run as a cron job. The
first argument specifies a base directory; the current time will be appended to
BASE, and files will be written there. Next, a symlink to the new directory
will be created (possibly replacing an old link) as specified by LINK. The
directory to which LINK previously pointed will be removed; some checks are in
place to make sure the old directory contains an old set of Captrap files, but,
to be safe, make sure the directory which contains the link is not writable by
normal users--otherwise a malicious user may make the link point somewhere else
and cause files to be removed. The final parameter to this action, ROUNDOFF,
specifies the unit by which the current time is rounded down; for example, if
ROUNDOFF is 'hour', pages will be generated as if this script were being run
exactly at the beginning of the current hour.
=back
=head1 EXIT STATUS
=over
=item 0Z<>
Everything is ok.
=item 1Z<>
No arguments given; usage information was shown.
=item 2Z<>
Invalid argument.
=item 3Z<>
There was a problem executing an action.
=back
=head1 FILES
=over
=item /etc/captrap/captrap.conf
The main Captrap configuration file.
=back
=head1 EXAMPLES
=over
=item Verbosely write the pages to a directory:
captrap_recurse -v-write page_dir ""
=item Write pages for a specific time to a directory:
captrap_recurse -write page_dir "now=2008-08-22T08:00:00"
=item Sample hourly crontab entry (should be placed in a user's crontab file):
PATH = /usr/local/bin:/usr/bin/:/bin
# m h dom mon dow command
5 * * * * captrap_recurse -cron-write /www/captrap/ /www/captrap/static hour
=back
=head1 AUTHOR
Corey Hickey
This program is free software; you may redistribute and/or modify it under the
terms of the GNU General Public License, version 3. See the source file for the
usual GPL preamble and the COPYING file for a copy of the GPL.
=head1 SEE ALSO
captrap_graph, captrap_view, captrap_main, crontab
The documentation included with the Captrap source code has more information on
setup and general usage.
=cut
use 5.010; # we need Perl >= 5.10 for // operator
use strict;
use warnings FATAL => 'all';
use File::Copy;
use File::Basename;
use Fcntl qw(:mode);
# for development using a different Captrap module
use lib "lib";
use Captrap qw(:cgi :misc :actions :args :config :db);
use Captrap::Main qw(:mainpage);
use Captrap::View qw(:view);
use Captrap::Graph qw(:graph);
# -----------------------------------------------------------------------------
# printing
# -----------------------------------------------------------------------------
# print main help info
# Called as an action handler, so it receives the common state hash even
# though it has no use for it.
sub usage {
    my ($common) = @_;    # unused
    my $actions_text = describe_actions(mk_actions());
    print <<"EOF";

This is a script for recursively generating Captrap's pages and graphs. For
full usage information, see the man page and/or documentation provided in the
Captrap source archive.
captrap_recurse [ACTION] [[ACTION-PARAMETERS]] ...
ACTIONS
$actions_text
EOF
}
# list all parameters
# Builds the parameter-info structure from the loaded config and hands it
# to the shared argument-listing helper.
sub list_params {
    my ($common) = @_;
    arg_list_params(Captrap::Main::mk_param_info($common->{config}));
}
# -----------------------------------------------------------------------------
# recursive page writing
# -----------------------------------------------------------------------------
# make recur hash
# Returns the shared recursion-state structure used by do_recur and
# friends: verbosity, optional validation callback, output directory,
# counters, and the per-CGI-program dispatch table.
sub mk_recur {
    my ($config) = @_;
    my $recur = {
        v        => 0,      # verbose flag
        validate => undef,  # set to a coderef by do_recur_validate
        dir      => undef,  # target directory, filled in by do_recur
        num      => 0,
        tree     => {},
    };
    # one entry per generating CGI program: its parameter info and the
    # function that produces its output
    $recur->{progs} = {
        "main.pl" => {
            info => Captrap::Main::mk_param_info(),
            func => \&Captrap::Main::mk_mainpage,
        },
        "viewer.pl" => {
            info => Captrap::View::mk_param_info(),
            func => \&Captrap::View::mk_views,
        },
        "grapher.pl" => {
            info => Captrap::Graph::mk_param_info($config),
            func => \&Captrap::Graph::mk_graph,
        },
    };
    return $recur;
}
# recursively write files to directory
# $common - shared state hash; $dir - target directory (must already
# exist); $params - URL-style parameter string.
# Returns 0 on success, 1 if $dir doesn't exist; exits with status 3 on
# stylesheet-copy failures (matching the exit codes documented in the POD).
sub do_recur {
my $common = shift;
my $dir = shift;
my $params = shift;
unless (-d "$dir") {
print STDERR "target directory \"$dir\" does not exist\n";
return 1;
}
# Copy the stylesheet.
# The following line is translated when 'make' is run; don't alter it without
# examining util/trans.pl.
my $cgidir = 'cgi-bin'; # translate me
# don't allow this to be a path--just the file name
my $stylesheet = (split('/', $common->{config}->{stylesheet}))[-1];
# refuse to clobber: an existing stylesheet implies this directory already
# contains a generated set of pages
if (-e "$dir/$stylesheet") {
print STDERR
"stylesheet $stylesheet already exists; not overwriting files\n";
exit 3;
}
$stylesheet = "$cgidir/$stylesheet";
if (copy($stylesheet, $dir)) {
if ($common->{recur}->{v}) {
print STDERR "copied stylesheet: $stylesheet\n";
}
} else {
print STDERR "can't copy stylesheet \"$stylesheet\": $!\n";
exit 3;
}
# Generate the files.
$common->{recur}->{dir} = $dir;
my $param_info = Captrap::Main::mk_param_info($common->{config});
my $file = "$dir/index.html";
# arg_handle_params drives the recursion, calling arg_mk_all to create
# the top-level index symlink and the rest of the page tree
arg_handle_params($common, $param_info, \&arg_mk_all, $params, $file);
# If we get here, it worked; otherwise, we've already exited.
# So, return 0 to signify an action that ran ok.
return 0;
}
# recursively write files to directory (verbose wrapper)
# Identical to do_recur, but switches on the verbose flag first.
sub do_recur_v {
    my ($common, $dir, $params) = @_;
    $common->{recur}->{v} = 1;
    return do_recur($common, $dir, $params);
}
# recursively write files to directory (validation wrapper)
# Like do_recur_v, but additionally installs the markup-validation
# callback so generated pages are run through onsgmls.
sub do_recur_validate {
    my ($common, $dir, $params) = @_;
    $common->{recur}->{v} = 1;
    $common->{recur}->{validate} = \&validate;
    return do_recur($common, $dir, $params);
}
# use provided cgi parameters to make all files
# Creates the top-level index.html symlink pointing at the generated
# main page; exits 3 if the symlink can't be created.
sub arg_mk_all {
    my ($common, $params, $file) = @_;
    my $prog = $common->{config}->{mainpage};
    my $param_info = Captrap::Main::mk_param_info($common->{config});
    my $out_params = params_input_to_output($param_info, $params);
    my $mainpage = mk_link($common, $prog, $out_params);
    symlink($mainpage, $file) or do {
        print STDERR "can't create symlink '$file': $!\n";
        exit 3;
    };
}
# -----------------------------------------------------------------------------
# cron mode
# -----------------------------------------------------------------------------
# wrapper for do_recur, intended for cron use
# Writes a fresh set of pages under $dir/<rounded-timestamp>, atomically
# repoints $link at the new directory, then removes the directory $link
# previously pointed to (after sanity checks in get_olddir/rm_olddir).
# $unit is the roundoff unit for the current time (e.g. 'hour').
# Returns 0 on success, 3 on any failure (matching the action exit codes).
sub do_recur_cron {
    my $common = shift;
    my $dir = shift;
    my $link = shift;
    my $unit = shift;
    # both the link's parent directory and the base directory must be
    # owned by us and not world-writable (see the POD's security note)
    my $linkdir = dirname($link);
    unless (check_dir($linkdir)) {
        return 3;
    }
    unless (check_dir($dir)) {
        return 3;
    }
    my $olddir = get_olddir($link); # may return undef
    my $times = get_times($common, undef);
    my $now = floor_unit($unit, $times->{now});
    my $newdir = "$dir/$now";
    unless (mkdir($newdir)) {
        print STDERR "can't make new directory '$newdir'\n";
        return 3;
    }
    if (do_recur($common, $newdir, "now=$now")) {
        print STDERR "recursive fetch failed\n";
        return 3;
    }
    # -l as well as -e: a stale temporary symlink may dangle, in which
    # case -e (which follows the link) misses it and the symlink() below
    # would fail with EEXIST
    if ((-l "$link.tmp" || -e "$link.tmp") && ! unlink("$link.tmp")) {
        print STDERR "can't unlink stale temporary symlink '$link.tmp'\n";
        return 3;
    }
    unless (symlink($newdir, "$link.tmp")) {
        print STDERR "can't create temporary symlink '$link.tmp'\n";
        return 3;
    }
    # rename() over the old link should be atomic
    unless (rename("$link.tmp", $link)) {
        print STDERR "can't rename temporary symlink to '$link'\n";
        return 3;
    }
    # do we need to remove old files?
    return 0 unless defined($olddir);
    unless (rm_olddir($olddir)) {
        return 3;
    }
    return 0;
}
# remove an old Captrap download directory
# Refuses to delete anything unless every entry looks like a
# Captrap-generated file (see file_ok).  Returns 1 on success, 0 on
# failure (with a message on STDERR).
sub rm_olddir {
    my $dir = shift;
    # lexical dirhandle: the old bareword DIR was a global and was never
    # closed
    my $dh;
    unless (opendir($dh, $dir)) {
        print STDERR "can't open dir '$dir'\n";
        return 0;
    }
    # explicit defined(): readdir in a while condition gets no implicit
    # defined() check, so a file literally named "0" would end the loop
    # early and leave the directory non-empty
    while (defined(my $file = readdir($dh))) {
        next if $file =~ /^\.{1,2}$/; # skip . and ..
        $file = "$dir/$file";
        # bail out without removing anything unrecognized
        return 0 unless (file_ok($file));
        unless (unlink($file)) {
            print STDERR "can't remove file: '$file'\n";
            return 0;
        }
    }
    closedir($dh);
    # should be empty by now
    unless (rmdir($dir)) {
        print STDERR "can't remove old dir '$dir'\n";
        return 0;
    }
    return 1;
}
# check if file looks like it was generated by Captrap
# Returns 1 when the file is a regular file (or the index.html symlink
# we create ourselves) with one of the extensions Captrap generates;
# otherwise prints a diagnostic to STDERR and returns 0.
sub file_ok {
    my $file = shift;
    my $mode = (lstat($file))[2];
    if (! defined($mode)) {
        print STDERR "lstat failed for '$file':$!\n";
        return 0;
    }
    # the only symlink we ever generate is index.html
    my $is_index_link = S_ISLNK($mode) && basename($file) eq 'index.html';
    if (! S_ISREG($mode) && ! $is_index_link) {
        print STDERR "not a regular file: '$file'\n";
        return 0;
    }
    if ($file !~ /\.(html|png|svg|txt|csv|css)$/) {
        print STDERR "unrecognized file: '$file'\n";
        return 0;
    }
    return 1;
}
# check if directory exists
# Verifies the path exists, is a real directory (not a symlink to one),
# and passes the ownership/permission checks.  Returns 1/0.
sub check_dir {
    my $dir = shift;
    if (! -e $dir) {
        print STDERR "directory '$dir' does not exist.\n";
        return 0;
    }
    # reject symlinked directories too: we follow this path for writes
    if (-l $dir or ! -d $dir) {
        print STDERR "'$dir' is not a directory\n";
        return 0;
    }
    return check_dir_perms($dir);
}
# check permissions/ownership of specified directory
# Returns 1 if the directory is owned by the effective user and is not
# world-writable; otherwise prints a diagnostic to STDERR and returns 0.
sub check_dir_perms {
    my $dir = shift;
    my ($mode, $uid) = (stat($dir))[2, 4];
    # stat can fail (e.g. the directory vanished); under FATAL warnings an
    # undef comparison below would kill the script
    unless (defined($mode)) {
        print STDERR "can't stat directory '$dir': $!\n";
        return 0;
    }
    # uids are numbers: compare numerically, not with the string 'ne'
    if ($uid != $>) {
        print STDERR "I don't own directory '$dir'\n";
        return 0;
    }
    # S_IWOTH (00002): writable by "other"
    if ($mode & S_IWOTH) {
        print STDERR "Other users can write to directory '$dir'\n";
        return 0;
    }
    return 1;
}
# look at the symlink to the old directory and check if it's ok
# Returns the path of the directory the symlink points to, or undef if
# there is no old directory or its contents don't look like a Captrap
# download (in which case it must not be removed).
sub get_olddir {
    my $link = shift;
    unless (-e $link) {
        return undef;
    }
    unless (-l $link) {
        print STDERR "Warning: file '$link' is not a symlink.\n";
        return undef;
    }
    unless (-d $link) {
        print STDERR "Warning: symlink '$link' does not point to a directory.\n";
        return undef;
    }
    my $target = readlink($link);
    my $olddir;
    if (substr($target, 0, 1) eq '/') {
        # absolute link
        $olddir = $target;
    } else {
        # relative link: resolve against the link's own directory
        $olddir = dirname($link) . "/$target";
    }
    # now see if the contents of $olddir look like a Captrap download
    my $dh;
    unless (opendir($dh, $olddir)) {
        print STDERR "Warning: can't open dir '$olddir'\n";
        return undef;
    }
    # explicit defined(): readdir in a while condition gets no implicit
    # defined() check, so a file named "0" would end the loop early
    while (defined(my $file = readdir($dh))) {
        next if $file =~ /^\.{1,2}$/; # skip . and ..
        $file = "$olddir/$file";
        next if file_ok($file);
        print STDERR "Unrecognized file in old directory: '$file'\n";
        closedir($dh);
        return undef;
    }
    closedir($dh);
    return $olddir; # looks ok
}
# -----------------------------------------------------------------------------
# validation
# -----------------------------------------------------------------------------
# build up an array of arguments to pass to onsgmls
# $onsgmls is the program path, $catalog the SGML catalog (xml.soc).
# Returns the full command list, suitable for list-form exec.
sub mk_onsgmls_cmd {
    my ($onsgmls, $catalog) = @_;
    # from w3c validator; don't remove these:
    my @w3c_opts = (
        "-c$catalog",
        qw(-n -E0 -wvalid -wnon-sgml-char-ref -wno-duplicate -wxml),
    );
    # so far these don't seem to hurt:
    my @extra_opts = qw(
        -wall -wimmediate-recursion -wfully-declared -wfully-tagged
        -wamply-tagged -wamply-tagged-recursive -wtype-valid -wintegral
    );
    # to add, maybe, if I can make the pages pass: -wnet
    return ($onsgmls, @w3c_opts, @extra_opts);
}
# run data through onsgmls
# Forks an onsgmls child wired up via two pipes: the parent writes $data
# to the child's stdin and reads the child's stderr.  Returns 1 if
# onsgmls exits 0 (markup valid), 0 otherwise (after printing the
# collected error output); dies on plumbing failures.
sub validate {
my $data = shift;
# program/catalog locations are overridable via the environment
my $onsgmls = $ENV{ONSGMLS_PATH} // '/usr/bin/onsgmls';
my $catalog = $ENV{CATALOG_PATH} //
"/usr/share/w3c-markup-validator/catalog/xml.soc";
my @cmd = mk_onsgmls_cmd($onsgmls, $catalog);
local (*C_OUT, *C_IN, *P_OUT, *P_IN);
pipe(C_IN, P_OUT); # parent --> child
pipe(P_IN, C_OUT); # child --> parent
my $childpid = fork;
die "couldn't fork: $!" unless defined($childpid);
if (! $childpid) {
# I am the child
close(P_OUT);
close(P_IN);
# wire the child's STDERR/STDIN to the pipe ends ("&=" dups the fd)
open(STDERR, ">&=C_OUT") or die "child: could not reopen STDERR: $!";
open(STDIN, "<&=C_IN") or die "child: could not reopen STDIN: $!";
close(STDOUT) or die "child: could not close STDOUT: $!";
$ENV{PATH} = ""; # for taint check
# list-form exec with an explicit program block never invokes a shell
exec({$cmd[0]} @cmd) or die "child: could not exec $onsgmls: $!";
}
# I am the parent
close(C_OUT);
close(C_IN);
#open(P_OUT, "> /tmp/debug");
print P_OUT $data or die "can't print data to child: $!";
# closing P_OUT sends EOF so onsgmls knows the document is complete
close(P_OUT) or die "close P_OUT failed: $!";
my $errors;
my $max_len = 2**16; # 64 KB should be enough
# NOTE(review): a single read() may legally return fewer bytes than are
# available; this assumes the error output arrives in one chunk -- confirm
my $bytes_read = read(P_IN, $errors, $max_len);
die "too many bytes returned from $onsgmls" if $bytes_read >= $max_len;
close(P_IN) or die "close P_IN failed: $!";
# don't want zombies
waitpid($childpid, 0);
my ($status, $signal) = ($? >> 8, $? & 127);
if ($status) {
print STDERR "$onsgmls failed. exit status $status, signal $signal\n";
print STDERR "full command: @cmd\n";
print STDERR "error output:\n", '-' x 80, "\n$errors\n", '-' x 80, "\n";
return 0;
}
return 1;
}
# -----------------------------------------------------------------------------
# actions info
# -----------------------------------------------------------------------------
# return a hash of action info
# Entries are inserted one at a time into an insertion-ordered hash so
# the help output lists actions in this order.  Each entry maps the
# command-line flag to its handler, argument names, and description.
sub mk_actions {
    my $actions = mk_ixhash();
    $actions->{"-help"} = {
        func => \&usage,
        args => [],
        desc => "
Print this usage text.
",
    };
    $actions->{"-list-parameters"} = {
        func => \&list_params,
        args => [],
        desc => "
Print a list of valid parameters.
",
    };
    $actions->{"-write"} = {
        func => \&do_recur,
        args => [ qw(DIRECTORY PARAMETERS) ],
        desc => "
Write the pages to a directory, as long as the directory already
exists.
",
    };
    $actions->{"-v-write"} = {
        func => \&do_recur_v,
        args => [ qw(DIRECTORY PARAMETERS) ],
        desc => "
Same as \"-write\", but verbose.
",
    };
    $actions->{"-validate"} = {
        func => \&do_recur_validate,
        args => [ qw(DIRECTORY PARAMETERS) ],
        desc => "
Same as \"-v-write\", but run pages through a markup validator.
",
    };
    $actions->{"-cron-write"} = {
        func => \&do_recur_cron,
        args => [ qw(BASE LINK ROUNDOFF) ],
        desc => "
This is a wrapper for the \"-write\" action intended to be run as a
cron job.
",
    };
    return $actions;
}
# ----------------------------------------------------------------------------
# parse the arguments and take actions
unless (@ARGV) {
    usage();
    exit(1);
}
my $actions = mk_actions();
check_args(\@ARGV, $actions);
# build the common state hash handed to every action handler
my $config = parse_config();
my $common = {
    cgi    => mk_cgi(),
    config => $config,
    dbh    => mk_dbh($config),
    recur  => mk_recur($config),
};
do_args($common, \@ARGV, $actions);
$common->{dbh}->disconnect();
exit(0);