#!/usr/local/bin/perl # parser.pl v 1.5 - 16th December, 1999 # by wayne.myers@bbc.co.uk (et al) # Enormous thanks to: # Chay Palton, Damion Yates, Matt Blakemore, Dan Tagg, T.V. Raman, Mark Foster # and many others. # parser.pl aka BETSIE is Copyright 1998,1999 BBC Digital Media # See licence.txt for full licence details # See readme.txt for more information # These documents and more are available on the Betsie website: # http://www.bbc.co.uk/education/betsie/ # Changes: 1.5 # 1 - Made move_nav a seperate subroutine # 2 - Fixed anchors bug in auto-detected referers # 3 - Cleaned up code some. # 4 - Added some notion of changeable settings # modules use Socket; #use strict; # variables my $VERSION = "1.5"; # version number my @x = (); # this array holds all the lines of the html page we're parsing my $contents = ""; # this string is @x concatenated my $inpath = ""; # this is the path_info string from which we get the rest my $root = ""; # this is the domain of the page we are looking at my $path = ""; # this is the path of the page we are looking at my $file = ""; # this is the name of the file we are looking at my $postdata = ""; # this is any POST method data my $method = "GET"; # and remains this way unless we get POST data my $length = -1; # but is the length of content if any greater in a POST my $count; # counter for main request loop my $httptype; # http type of request my $code; # http return code my $msg; # http message my $newurl; # used to store redirect target my $tag; # used to store meta redirect tags my $loop_flag; # flag used to make sure we get the right page my $script_flag; # flag used to see if we are in script tags or not my $ws_flag; # flag used to minimise unnecessary white space my $set = 0; # is 1 if we want the settings page my $body; # holds the body tag # VARIABLES YOU MIGHT WANT TO CHANGE: my $maxpost = 65536; # is maximum number of bytes in a POST my $parsehome = "http://www.bbc.co.uk/education/betsie/"; my $name = "betsie-1.5.pl"; # name of this file my $agent = $ENV{'HTTP_USER_AGENT'}; # pretend to be the calling browser # variables for colour/font settings etc # be sure to amend make_body() if you amend them my $setstr = "/0005"; # is default string for settings. my $chsetstr = "/1005"; # string used for default change settings page # next five arrays are for each set of colour options. feel free to add to or amend these. my @bg = qw(#000000 #FFFFFF #0000FF #FFFFCC); my @text = qw(#FFFF00 #000000 #FFFFFF #000000); my @link = qw(#00FFFF #0000FF #FFFFCC #0000FF); my @vlink = qw(#00CCFF #0000CC #FFFF99 #0000CC); my @alink = qw(#FFFF00 #000000 #FFFFFF #000000); # ten fonts. again, you can change these if you like my @font_face = ("Verdana, Arial", "Times", "Courier", "Helvetica", "Arial", "Bookman Old Style", "Geneva", "Chicago", "Courier New", "System"); # VARIABLES YOU MUST SUPPLY: my $pathtoparser = "http://$ENV{'SERVER_NAME'}/cgi-bin/education/betsie/$name"; #my $pathtoparser = "http://localhost/cgi-bin/betsie/$name"; my $parsecontact = "education.online\@bbc.co.uk"; my $localhost = "www.bbc.co.uk"; my @safe = qw ( bbc.co.uk beeb.com bbcworldwide.com bbcresources.com bbcshop.com radiotimes.com open.ac.uk open2.net freebeeb.net ); # VARIABLES YOU (PROBABLY) DON'T WANT TO TOUCH: my ($rec_test) = $pathtoparser =~ /^http:\/(.*)$/; # var to solve recursion problem # Set alarm handler (comment this out on systems that can't handle it) alarm 10; $SIG{ALRM} = \&alarm; # main loop $|=1; print "Content-type: text/html\n\n"; # handle POST requests if ($ENV{'REQUEST_METHOD'} eq "POST") { $length = $ENV{'CONTENT_LENGTH'}; if ($length > 65536) { $x[0] = "Too much data for POST method."; error(); exit; } if ($length) { read(STDIN, $postdata, $length, 0); $method = "POST"; } } # take path info or referer allowing easy linking in... $inpath = $ENV{'PATH_INFO'} || $ENV{'HTTP_REFERER'}; # strip http/ftp/gopher etc scheme if present (ie came from referer) $inpath =~ s/^\w+:\///; # Uncomment the following ugly hack for servers that don't do PATH_INFO properly # (if you couldn't do alarm above, you probably need this too :( ) # $inpath =~ s/^.*?$name//; unless ($inpath =~ /^[a-zA-Z0-9_.\-\/\#]+$/) { $x[0] = "Unknown error"; error(); exit; } # beat recursive betsie bug $inpath =~ s/^$rec_test//i; # get optional settings string $inpath =~ s#(\/\d+)\/#\/#; $setstr = $1 || "/0005"; if (length $setstr != 5) { $setstr = "/0005"; } $chsetstr = $setstr; $chsetstr =~ s/^\/(\d)/\/1/; if ($1 eq "1") { $set = 1;}; ($root, $path, $file) = urlcalc($inpath); unless (safe("http:\/\/$root")) { $x[0] = "http:\/\/$root$path$file<\/A> not on safe list. Sorry"; error(); exit; } $loop_flag = 0; $count = 0; LOOP: while ($loop_flag == 0) { $count++; if ($count == 9) { $x[0] = "Too many times through the loop."; error(); exit; } if ($ENV{'QUERY_STRING'} ne "") { $file .= "\?$ENV{'QUERY_STRING'}" } @x = graburl($root, $path . $file); $contents=""; foreach (@x) { $contents .= $_; } # handle http codes # 3xx we follow the redirect # anything other than 200 is an error. ($httptype, $code, $msg) = split /\s+/, $x[0]; if ($code =~ /^3\d\d/) { $contents =~ s/^.*Location:\s+(\S+)\s.*$/$1/s; $newurl = $contents; redir(); next LOOP; } if ($code !~ /200/) { error(); exit; } # check for autoredirects of all sorts if ($contents =~ /(]*?HTTP-EQUIV[^>]*?REFRESH[^>]*?>)/is) { $tag = $1; unless ($tag =~ /content=\"\d{3,}/is) { # only deal with refreshes of 99 secs or less if ($tag =~ /url=(.*?)\"/is) { # don't refresh if no url given $newurl = $1; unless ($file =~ /$newurl$/) { # don't refresh to same page redir(); next LOOP; } } } } # if we got here we must have got something and can end the loop $loop_flag = 1; } # make the body tag $body = make_body(); # if we're on the settings page, send that and exit if ($set != 0) { print < Betsie Settings Page $body HTML exit; } # we're not on the settings page. call parser routines $contents = preparse($contents); @x = split /\n/, $contents; $contents = ""; # start sending $script_flag = 0; # it will be 1 if we are in