#!/usr/local/bin/perl -Tw # parser.pl v 1.5.12 # by wayne.myers@bbc.co.uk (et al) # Enormous thanks to: # Chay Palton, Damion Yates, George Auckland, # Dan Tagg, T.V. Raman, Mark Foster, Peter Burden, # Jack Evans, Steve at wels.net, Gene D., Maurice Walshe # Mark A. Rowe and many others. # parser.pl aka BETSIE is Copyright 1998 - 2001 BBC Digital Media # See LICENCE for full licence details # See README for more information # See ChangeLog for version changes # modules use Socket; use strict; # variables my $version = "1.5.12"; # version number my @x = (); # all the lines of the html page we're parsing my $contents = ""; # @x concatenated my $inpath = ""; # path_info string from which we get the rest my $qs; # query string, used for authorisation process my $root = ""; # domain of the page we are looking at my $path = ""; # path of the page we are looking at my $file = ""; # name of the file we are looking at my $postdata = ""; # POST method data my $method = "GET"; # $method remains this way unless we get POST data my $length = -1; # but is the length of content if any greater in a POST my $count; # counter for main request loop my $httptype; # http type of request my $code; # http return code my $msg; # http message my $newurl; # used to store redirect target my $tag; # used to store meta redirect tags my $loop_flag; # flag used to make sure we get the right page my $script_flag; # flag used to see if we are in script tags or not my $ws_flag; # flag used to minimise unnecessary white space my $set = 0; # is 1 if we want the settings page my $body; # the body tag my $cookies; # cookies we want to pass on to other server my @setcookies; # cookies server wants us to pass onto user my $header; # the http header we want to send the browser my $extraheaders; # extra headers to send the server on requests my $nocontenttype = 1; # flag unset when content type is printed my $basic_realm; # contains strin my $set_auth = 0; # flag set when required authorisation is provided # VARIABLES YOU MIGHT WANT TO CHANGE: my $pathtoparser = "http://$ENV{'SERVER_NAME'}$ENV{'SCRIPT_NAME'}"; my $selfnuke = "(?:$pathtoparser|$ENV{'SCRIPT_NAME'})"; # used to eliminate textonly links to self my $maxpost = 65536; # is maximum number of bytes in a POST my $parsehome = "http://www.bbc.co.uk/education/betsie/"; my $name = $ENV{'SCRIPT_NAME'}; # name of this file $name =~ s/^.*\/(\w\.+)$/$1/; my $agent = $ENV{'HTTP_USER_AGENT'}; # pretend to be the calling browser my $allowchars = '[a-zA-Z0-9_.\-\/\#\?\&\=\%\~\+\:]'; # allowed characters my $alarm = 20; # number of seconds before we time out # variables for colour/font settings etc # be sure to amend make_body() if you amend them my $setstr = "/0005"; # is default string for settings. my $chsetstr = "/1005"; # string used for default change settings page # next five arrays are for each set of colour options. feel free to add to or amend these. my @bg = ('#000000', '#FFFFFF', '#0000FF', '#FFFFCC'); my @text = ('#FFFF00', '#000000', '#FFFFFF', '#000000'); my @link = ('#00FFFF', '#0000FF', '#FFFFCC', '#0000FF'); my @vlink = ('#00CCFF', '#0000CC', '#FFFF99', '#0000CC'); my @alink = ('#FFFF00', '#000000', '#FFFFFF', '#000000'); # ten fonts. again, you can change these if you like my @font_face = ("Verdana, Arial", "Times", "Courier", "Helvetica", "Arial", "Bookman Old Style", "Geneva", "Chicago", "Courier New", "System"); # VARIABLES YOU MUST SUPPLY: my $localhost = ""; #"www.bbc.co.uk"; # name of the actual machine which is localhost # (not necessarily same as server name if its virtual) my $parsecontact = "wayne.myers\@bbc.co.uk"; my @safe = qw ( bbc.co.uk beeb.com bbcworldwide.com bbcresources.com bbcshop.com radiotimes.com open.ac.uk open2.net freebeeb.net ); # VARIABLES YOU (PROBABLY) DON'T WANT TO TOUCH: my ($rec_test) = $pathtoparser =~ /^http:\/(.*)$/; # var to solve recursion problem # Set alarm handler (comment this out on systems that can't handle it) alarm $alarm; $SIG{ALRM} = \&alarm; # main loop $|=1; # handle POST requests if ($ENV{'REQUEST_METHOD'} eq "POST") { $length = $ENV{'CONTENT_LENGTH'}; if ($length > $maxpost) { $x[0] = "Too much data for POST method."; error(); exit; } if ($length) { read(STDIN, $postdata, $length, 0); $method = "POST"; } } # take path info or referer allowing easy linking in... $inpath = $ENV{'PATH_INFO'} || $ENV{'HTTP_REFERER'}; # strip http/ftp/gopher etc scheme if present (ie came from referer) $inpath =~ s/^\w+:\///; # Uncomment the following ugly hack for servers that don't do PATH_INFO properly # (if you couldn't do alarm above, you probably need this too :( ) # $inpath =~ s/^.*?$name//; # get query string $qs = $ENV{'QUERY_STRING'} || ""; if (($inpath !~ /^$allowchars+$/) || ($inpath =~ /\.\./) || ($qs !~ /^$allowchars*$/) || ($qs =~ /\.\./)) { $x[0] = "Unknown error"; error(); exit 0; } # beat recursive betsie bug #1 $inpath =~ s/^$rec_test//i; # get optional settings string $inpath =~ s!^(\/\d{4})\/!\/!; $setstr = $1 || $setstr; # is already initialised to '/0005' if (length $setstr != 5) { $setstr = "/0005"; } $chsetstr = $setstr; $chsetstr =~ s/^\/(\d)/\/1/; if ($1 eq "1") { $set = 1;}; ($root, $path, $file) = urlcalc($inpath); unless (safe("http:\/\/$root")) { $x[0] = "http:\/\/$root$path$file<\/a> not on safe list. Sorry"; error(); exit; } $cookies = $ENV{'HTTP_COOKIE'} || $ENV{'COOKIE'} || ""; # turn any Betsie-auth cookie into an Authorization header if ($cookies =~ s/Betsie-auth=([A-Za-z0-9\+\=\/]*?)\;//s) { $extraheaders = "Authorization: Basic $1\n"; $set_auth = 1; } # if query string contains our betsie-pi parameter # this is an attempt to authorise ourselves on a page # if it isn't really, the following will simply fail... if($postdata =~ /betsie-pi=/) { # set a cookie with the auth string my ($authcook, $authloc) = make_auth_cookie_and_loc($file); # redirect to ourselves with the proper url print "Set-Cookie: Betsie-auth=$authcook;\n"; print "Content-Type: text/html\n\n"; print <

Logging in. Please follow this link. HTML exit 0; } $loop_flag = 0; $count = 0; LOOP: while ($loop_flag == 0) { $count++; if ($count == 9) { $x[0] = "Too many times through the loop."; error(); exit; } if ($qs ne "") { $file .= "\?$qs" } @x = graburl($root, $path . $file); $contents=join '', @x; # handle http codes # 3xx we follow the redirect # anything other than 200 is an error. ($httptype, $code, $msg) = split /\s+/, $x[0]; if ($code =~ /^3\d\d/) { $contents =~ s/^.*Location:\s+(\S+)\s.*$/$1/s; $newurl = $contents; redir(); next LOOP; } if ($code !~ /200|401/) { error(); exit; } # check for autoredirects of all sorts if ($contents =~ /(]*?http-equiv[^>]*?refresh[^>]*?>)/is) { $tag = $1; unless ($tag =~ /content=\"\d{3,}/is) { # only deal with refreshes of 99 secs or less if ($tag =~ /url=(.*?)\"/is) { # don't refresh if no url given $newurl = $1; # unless ($file =~ /$newurl$/) { # don't refresh to same page unless (($file =~ /$newurl$/) || ($newurl =~ /$path$file$/)) { # don't refresh to same page redir(); next LOOP; } } } } # if we got here we must have got something and can end the loop $loop_flag = 1; } # lose HTTP OK line if ($code =~ /200|401/) { $contents =~ s/^HTTP[^\n]*\n//s; } # ignore all files not reported as some kind of text in content-type if ($contents !~ /Content-Type:\s+text/is) { # send contents on unchanged print $contents; exit 0; } # pass cookies on but rewrite path while ($contents =~ s/(Set-Cookie:[^\n]*\n)//isg) { my $c = $1; if ($c =~ /path=/) { # arbitrarily rewrite path to '/' so both betsie and originator can see it $c =~ s/(path=\/)[^;]*;/$1;/is; } else { $c =~ s/\n$/ path=\/;\n/; } print $c; } # get header out of contents # finding it hard to match \n\n for some reason $contents =~ s/^(.*?\n\s*\n)//s; $header = $1; $header =~ s/Content-Length.*\n//i; # because the length is now wrong # add content type to header if not present if ($header !~ /Content-Type/s) { $header =~ s/\n\n$/\nContent-Type: text\/html\n\n/s; } # handle WWW-Authenticate requests if (($set_auth == 0) && ($header =~ /WWW-Authenticate:([^\n]*\n)/isg)) { # ($basic_realm) = $header =~ /Basic realm=\"([^\"]*?)\"/is; $contents = make_auth_page(); print $header, $contents; exit 0; } # *now* print header content type print $header; # set nocontenttype flag to ensure it never gets printed again $nocontenttype = 0; # make the body tag $body = make_body(); # if we're on the settings page, send that and exit if ($set != 0) { print < Betsie Settings Page $body HTML exit; } # we're not on the settings page. call parser routines $contents = preparse($contents); @x = split /\n/, $contents; $contents = ""; # start sending $script_flag = 0; # it will be 1 if we are in