#! /usr/local/bin/perl

# Written by Stephen Fenner
# Started 9/17/98 (1.0)
# 1.1 9/18/01
#
# Copyright 2001 by Stephen A. Fenner.  This software may be used and
# modified freely, provided the above comments and are preserved.  All other
# rights are reserved.  Please let me know of any enhancements or bugfixes
# you may have by email to fenner@cse.sc.edu.  Thanks.
#
# Script for converting handouts in a pseudo-LaTeX format into html

# NOTE: everything below is deliberately a package global (no "my"):
# the rest of the script refers to these names directly.

# Constants
$program            = "p2h";
$version            = 1.1;
$defaultVerbatimEnd = "\nEOV\n";

# Useful regexps.  They are kept as plain strings because other parts of
# the script interpolate them into larger patterns with their own flags.

# Stuff from an opening to a closing curly brace, with no curly braces
# in between.
$field = '(?:\{[^{}]*\})';

# A backslash followed by an identifier, followed by an optional
# argument (in square brackets), followed by zero or more fields.
# Captures: $1 = command name, $2 = "[...]" or empty, $3 = fields or empty.
# The literal brace in the lookahead is escaped so the pattern compiles
# on Perls that reject an unescaped "{".
$command = '(?:\\\\(\w+)(?!\w)((?!\[)|\[[^\]]*\])((?!\{)|' . $field . '+))';

# An opening square bracket (unescaped) followed by text until the next
# unescaped closing square bracket.
$tagArgs = '(^\[|[^\[]\[)((?:[^\\\\\]]*\\\\[^\]])*[^\\\\\]]*)\]';

# A whole line containing a verbatim command, which must start the line,
# followed by an optional argument, followed by the rest of the line,
# which must consist of zero or more decimal digits.
$verbatim = '(?:(?:^|\n)\\\\(?:verbatim|VERBATIM)(\[([^\]]+)\])?(\d*)\n)';

# Characters to be treated specially.
$specialChars = '[\ $\t\n{}\[\]~\\\\_^"|]';

# Stuff running from "$$" to "$$", with no "$" (but at least one
# character) in between.
$displayMathenv = '\$(\$[^$]+\$)\$';

# Stuff running from "$" to "$" with no "$" (but at least one character)
# in between.
$mathenv = '\$([^$]+)\$';

# Single non-identifier characters used as command names without a
# preceding backslash, e.g., subscript and superscript.  (The next
# character is also grabbed so if it is not an opening curly brace,
# it can serve as the one-character argument, e.g., "x^2".)
$commandChars = '([_^])(.)';

# Single or double vertical bars that start table entries or rows,
# respectively.  (A row starter also starts the first entry in the
# row.)  $1 is the optional "[...]" argument following the bar.
$tableMark = '(?:\|((?:\[[^\]]*\])?))';

# The following is a list of protected tag arguments.
@tagArgList = ();

# The following translations are done when the given character is
# preceded by a backslash ("<", ">" and "&" are translated even when
# unescaped).  Every special character maps to a character reference
# rather than to itself: the replacement text is scanned again by the
# later math, sub/superscript, command and delimiter-check passes, and a
# raw "$", "{", "_", "|" etc. would be re-interpreted there.
%char2entity = (
    ' '  => '&nbsp;',
    '$'  => '&#36;',
    "\t" => '&nbsp;',
    "\n" => "\\br\n",
    '{'  => '&#123;',
    '}'  => '&#125;',
    '['  => '&#91;',
    ']'  => '&#93;',
    '~'  => '&#126;',
    '\\' => '&#92;',
    '_'  => '&#95;',
    '^'  => '&#94;',
    '&'  => '&amp;',
    '<'  => '&lt;',
    '>'  => '&gt;',
    '"'  => '&quot;',
    '|'  => '&#124;',
);

# These characters translate to commands with no preceding backslash.
%char2command = (
    '_' => 'sub',
    '^' => 'sup',
);

# Any alphabetic commands not on the command list are assumed to be HTML
# tag names and convert to HTML tags with arguments given in square
# brackets.  Names are chosen to correspond with LaTeX as much as I am
# familiar.  Many of these entities (especially for HTML 4.x) were found at
# http://www.alanwood.net/demos/ent4_frame.html.

# Each key in the command table is associated with a function that
# reads the arguments of the command (if any) and outputs the text
# to replace the command.  (This text may be processed further in
# a subsequent pass.)
# Commands whose replacement text is a fixed string.  Each entry becomes
# a handler in %commands that ignores its arguments and returns the text.
# Replacement text that itself starts with a backslash (e.g. "\le") is
# expanded again by a later pass.
my %literalFor = (
    # Relations and named operators
    leq => "\\le", geq => "\\ge",
    log => "log", lg => "lg", det => "det", div => "div", mod => "mod",
    gcd => "gcd", lcm => "lcm", trace => "tr",
    domain => "dom", range => "rng", codomain => "cod",
    ldots => "&hellip;",    # the terminating ";" is required for the entity
    cdots => "\\cdot\\cdot\\cdot",
    floor => "floor", ceiling => "ceiling", min => "min", max => "max",
    iff => "\\cleftrightarrow", implies => "\\crightarrow",
    exists => "∃", forall => "∀",
    pr => "Pr", mid => " : ",
    in => "∈", notin => "∉",
    and => "\\wedge", or => "\\vee",

    # Capital Greek letters.
    # Unlike TeX, our command names are not case sensitive,
    # so we can't just use "\Sigma" for example.  So we prefix with "c"
    # instead.
    calpha => "Α", cbeta => "Β", cgamma => "Γ", cdelta => "Δ",
    cepsilon => "Ε", czeta => "Ζ", ceta => "Η", ctheta => "Θ",
    ciota => "Ι", ckappa => "Κ", clambda => "Λ", cmu => "Μ",
    cnu => "Ν", cxi => "Ξ", comicron => "Ο", cpi => "Π",
    crho => "Ρ", csigma => "Σ", ctau => "Τ", cupsilon => "Υ",
    cphi => "Φ", cchi => "Χ", cpsi => "Ψ", comega => "Ω",

    # Small Greek letters
    alpha => "α", beta => "β", gamma => "γ", delta => "δ",
    epsilon => "ε", zeta => "ζ", eta => "η", theta => "θ",
    thetasym => "ϑ", iota => "ι", kappa => "κ", lambda => "λ",
    mu => "μ", nu => "ν", xi => "ξ", omicron => "ο",
    pi => "π", rho => "ρ", sigma => "σ", sigmaf => "ς",
    tau => "τ", upsilon => "υ", upsih => "ϒ", phi => "φ",
    varphi => "ϕ", chi => "χ", psi => "ψ", omega => "ω",
    piv => "ϖ",

    # Other math symbols from WGL4 HTML
    prime => "′", doubleprime => "″", neg => "¬", pm => "±",
    middot => "·",    # a little higher
    cdot   => "⋅",    # a little lower
    half => "½", quart => "¼", threequart => "¾",
    times => "×", divide => "÷",
    dagger => "†",
    ddagger => "‡",     # double dagger
    permille => "‰",    # per thousandth
    fracslash => "⁄", aleph => "ℵ", partial => "∂", emptyset => "∅",
    dell => "∇",
    ni => "∋",          # backwards \in
    sum => "∑", prod => "∏", minus => "−",
    lowast => "∗",      # asterisk operator
    sqrt => "√", propto => "∝", infty => "∞",
    wedge => "∧", vee => "∨", cap => "∩", cup => "∪",
    integral => "∫",
    sim => "∼",         # tilde operator
    cong => "≅",        # tilde over equals
    approxeq => "≈",    # tilde over tilde
    neq => "≠",         # (was listed twice; this later value was the effective one)
    equiv => "≡",       # three horizontal lines
    le => "≤", ge => "≥", ne => "≠",
    subset => "⊂", supset => "⊃", notsubset => "⊄",
    subseteq => "⊆", supseteq => "⊇",
    oplus => "⊕", otimes => "⊗",
    bot => "⊥",         # also means perpendicular
    reals => "ℜ", imags => "ℑ",
    weierp => "℘",      # Weierstrass script P
    lceil => "⌈", rceil => "⌉", lfloor => "⌊", rfloor => "⌋",
    langle => "⟨", rangle => "⟩",
    leftarrow => "←", rightarrow => "→", uparrow => "↑",
    downarrow => "↓", leftrightarrow => "↔",
    cleftarrow => "⇐", crightarrow => "⇒", cuparrow => "⇑",
    cdownarrow => "⇓", cleftrightarrow => "⇔",

    # Other miscellaneous characters
    emdash => "—", endash => "–",
    cae => "Æ", ae => "æ", coe => "Œ", oe => "œ", ss => "ß",
    euro => "€",
    cent => "¢",        # cent sign
    pound => "£",       # British pound
    copyright => "©",
    tm => "™",          # trademark
    reg => "®",         # registered trademark
    lozenge => "◊",     # skinny white diamond
    spades => "♠", clubs => "♣", hearts => "♥", diamonds => "♦",
);

# Command table: lower-case command name => handler.  A handler is called
# with the resolved option string and a reference to the array of fields,
# and returns the replacement text.  Commands not listed here are turned
# into HTML tags of the same name.
%commands = (
    p       => \&p,
    tabular => \&tabular,
    entity  => \&entity,
    today   => \&today,
    now     => \&now,
    comment => \&comment,
    display => \&display,
);
for my $name ( keys %literalFor ) {
    my $text = $literalFor{$name};
    $commands{$name} = sub { return $text; };
}

# Main driver.  Arguments are processed left to right:
#   -v     turn on progress messages (for the files that follow)
#   -      convert standard input to standard output
#   NAME   convert NAME.ptex (the suffix is added if missing) into
#          NAME.html; an existing NAME.html is first renamed to
#          NAME.html.bak
# The source is converted completely before any output file is touched,
# so a source with errors leaves the previous NAME.html in place.
{
    my $verbose = '';

    while ( @ARGV ) {
        my $arg = shift @ARGV;

        if ( $arg eq '-v' ) {
            $verbose = 1;
            next;
        }

        # An empty $filename means standard input to standard output.
        my $filename = '';
        my $basename;
        if ( $arg ne '-' ) {
            $filename = $arg;
            $filename .= '.ptex' if $filename !~ /\.ptex$/;
            ( $basename = $filename ) =~ s/\.ptex$//;
        }

        print STDERR "$program: processing",
            ( $filename ne '' ? " $filename" : '' ), " ... "
            if $verbose;

        # Grab the entire source at once, without going through a shell.
        my $src;
        {
            local $/;    # slurp mode
            if ( $filename ne '' ) {
                open( my $in, '<', $filename )
                    or die "$program: cannot open $filename for reading ($!)\n";
                $src = <$in>;
                close( $in );
            }
            else {
                $src = <STDIN>;
            }
        }
        $src = '' unless defined $src;
        chomp $src;

        # Protected tag arguments are numbered per source file.
        @tagArgList = ();

        $src = firstPass( $src );
        $src = secondPass( $src );

        die "\nThere were errors.\n" if check( $src );

        if ( $filename ne '' ) {
            my $html = "$basename.html";
            if ( -e $html ) {
                rename( $html, "$html.bak" )
                    or die "$program: backup failed ($!)\n";
            }
            open( my $out, '>', $html )
                or die "Cannot open $html for writing ($!)\n";
            print {$out} "$src\n";
            close( $out ) or die "$program: error writing $html ($!)\n";
            chmod 0644, $html;
        }
        else {
            print "$src\n";
        }
        print STDERR "done\n" if $verbose;
    }
}

# One pass through the entire source is used for EACH of the following tasks:
#  - Convert verbatim sections into quote sections
#  - Protect tag arguments (text in square brackets) from translation
#  - Convert unescaped HTML metacharacters into entities
#  - Convert escaped special characters into HTML entities
#  - Convert "\&" into "&" (I forgot why I had to do this!)
#  - Convert escaped ASCII values into HTML entities
#  - Convert escaped nonalphanumeric characters into numeric entities
#  - Convert $$ ... $$ into displayed math
#  - Convert $ ... $ into in-line math
#  - Convert "_" and "^" into \sub and \sup commands
# Tasks should be done in the order above, because the result of one
# pass might be processed again by a later pass.
#
# Takes the raw source text and returns the translated text, which still
# contains backslash commands for the second pass to expand.
sub firstPass {
    my ( $src ) = @_;

    $src = translateVerbatim( $src );
    $src = protectTagArgs( $src );

    $src =~ s/([<>&])/$char2entity{$1}/g;
    $src =~ s/\\($specialChars)/$char2entity{$1}/go;
    $src =~ s/\\\&/\&/g;
    $src =~ s/\\(\d+)/\&#$1;/g;
    $src =~ s/\\(\W)/"\&\#".ord($1).";"/eg;

    $src =~ s/$displayMathenv/\\display\{$1\}/gos;
    $src =~ s/$mathenv/convertMath($1)/goes;
    $src =~ s/$commandChars/convertCommandChar($1,$2)/goes;

    return $src;
}

# Multipass bottom-up parsing/conversion (done by secondPass).
# Each pass converts the innermost commands into HTML tags "<TAG ...>" or,
# if there is an argument, "<TAG ...>argument</TAG>", where "..." is given
# by the optional (square bracketed) argument, if any.
# Example:
#     \hr
# is converted to
#     <HR>
# Further examples of the command-to-tag conversion done by secondPass:
#     \b{this is bold}
# is converted to
#     <B>this is bold</B>
# and
#     \a[HREF="http://www.google.com"]{Google}
# is converted to
#     <A HREF="http://www.google.com">Google</A>
# The passes stop when there are no more commands to convert.
sub secondPass {
    my ( $src ) = @_;
    1 while $src =~ s/$command/convertCommand($1,$2,$3)/ego;
    return $src;
}

# Replace the text of every square-bracketed tag argument by its 1-based
# position in @tagArgList, so that the later passes leave it alone.
# The position is taken from the list itself, so numbering stays correct
# even when the list already holds entries from an earlier call.
sub protectTagArgs {
    my ( $src ) = @_;
    my $ret = '';

    while ( $src =~ /$tagArgs/so ) {
        my ( $before, $opener, $arg, $after ) = ( $`, $1, $2, $' );
        push @tagArgList, $arg;
        $ret .= $before . $opener . scalar( @tagArgList ) . ']';
        $src = $after;
    }
    return $ret . $src;
}

# Convert text in a math environment (between $ ... $ or $$ ... $$).
# Basically, this just means italicizing any letters.  Runs of white space
# are collapsed and relational operators are set off by single spaces;
# "<" and ">" are recognized both raw and as the entities they have
# normally been turned into by the time this is called.
sub convertMath {
    my ( $src ) = @_;
    $src =~ s/\s+/ /g;
    $src =~ s/\s*(=|<|>|&lt;|&gt;|\\leq|\\geq)\s*/ $1 /g;
    $src =~ s/$commandChars/convertCommandChar($1,$2)/goes;
    $src =~ s/([\\&]?)([A-Za-z]+)(;?)/convertMathSymb($1,$2,$3)/ge;
    return $src;
}

# Italicize one run of letters found in math text, unless it is the name
# of a backslash command or of an "&name;" entity, which are returned
# unchanged.
sub convertMathSymb {
    my ( $pre, $text, $post ) = @_;

    my $isEntity  = ( $pre eq '&' && $post eq ';' );
    my $isCommand = ( $pre eq '\\' );
    return $pre . $text . $post if $isEntity || $isCommand;

    return $pre . '\\i{' . $text . '}' . $post;
}

# Turn "_" or "^" into the corresponding command.  If the following
# character is an opening brace it starts the argument; otherwise that
# single character becomes the argument.
sub convertCommandChar {
    my ( $char, $arg ) = @_;
    my $opening = ( $arg eq '{' ) ? '{' : '{' . $arg . '}';
    return '\\' . $char2command{$char} . $opening;
}

# Convert anything between the line
#     \verbatim
# and
#     EOV
# to verbatim text.  The "EOV" signal can be altered to be the text
# of the optional argument to \verbatim.  Digits after the command set the
# number of spaces used for each leading tab; the setting stays in effect
# for the verbatim sections that follow.
# Dies if a verbatim section has no end line.
sub translateVerbatim {
    my ( $src ) = @_;
    my $ret     = '';
    my $tabStop = -1;

    while ( $src =~ /$verbatim/so ) {
        my ( $before, $after, $endArg, $endText, $digits )
            = ( $`, $', $1, $2, $3 );
        $ret .= $before;

        my $verbatimEnd
            = defined $endArg ? "\n$endText\n" : $defaultVerbatimEnd;
        $tabStop = $digits if defined $digits && $digits ne '';

        # The end marker is literal text, so search for it as a string
        # rather than as a pattern.
        my $endAt = index( $after, $verbatimEnd );
        die "No end to verbatim environment: '$after'\n" if $endAt < 0;

        $ret .= verb2quote( substr( $after, 0, $endAt ), $tabStop );
        $src = substr( $after, $endAt + length $verbatimEnd );
    }
    return $ret . $src;
}

# Turn verbatim text into a \tt section in which every special character
# is escaped with a backslash.  If $tabStop is not negative, each tab at
# the start of a line is replaced by $tabStop spaces.
sub verb2quote {
    my ( $text, $tabStop ) = @_;

    if ( $tabStop >= 0 ) {
        my $tab = ' ' x $tabStop;
        $text =~ s/((?:^|\n))(\t+)/$1 . ( $tab x length $2 )/ge;
    }
    $text =~ s/($specialChars)/\\$1/go;
    return "\\br\n\\tt\{\\br\n$text\\br\n\}\\br\n";
}

# Convert one command.
#   $name   - command name (case is ignored)
#   $option - "[N]" where N is a position in @tagArgList, or empty
#   $args   - zero or more "{...}" fields
# Commands found in %commands are handled by their handler; anything else
# becomes the HTML tag <NAME option>, and with one field the element
# <NAME option>field</NAME>.  More than one field for a plain tag gives a
# warning and an empty replacement.
sub convertCommand {
    my ( $name, $option, $args ) = @_;
    my @fields = ();

    $option =~ s/^\[//;
    $option =~ s/\]$//;
    die "Mistranslated tag option: $option ($!)\n"
        if $option && ( $option !~ /^\d+$/s || $option > @tagArgList );
    $option = " $tagArgList[$option-1]" if $option;

    # Get the array of fields, with the braces stripped off.
    while ( $args =~ /^$field/o ) {
        my $tmp = $&;
        $args = $';
        $tmp =~ s/^\{//;
        $tmp =~ s/\}$//;
        push @fields, $tmp;
    }

    $name = lc $name;
    my $thisCommand = $commands{ $name };
    return $thisCommand->( $option, \@fields ) if defined $thisCommand;

    if ( @fields > 1 ) {
        warn "Too many arguments to $name command\n";
        return '';
    }

    $name = uc $name;
    my $ret = "<$name$option>";
    $ret .= $fields[0] . "</$name>" if @fields;
    return $ret;
}

# \p{...}: wrap the text in paragraph tags, starting a new paragraph at
# every blank line.  The option, if any, is put into each opening tag.
sub p {
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    $src =~ s/\n\n/<\/P>\n<P$option>/g;
    return "<P$option>$src</P>\n";
}

# \tabular[option]{...}: rewrite the table body into \table, \tr and \td
# commands and convert those.  "||" starts a row (and its first entry),
# "|" starts an entry; either may be followed by a bracketed option.
sub tabular {
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];

    if ( $option ) {
        # The option has already been resolved to its real text, but the
        # \table command built below is converted again and must carry a
        # position in @tagArgList, so register the text again.
        $option =~ s/^\s*//;
        push @tagArgList, $option;
        $option = '[' . scalar( @tagArgList ) . ']';
    }

    my $ret = "\\table$option\{\n";
    while ( $src =~ s/^\s*$tableMark$tableMark\s*//so ) {
        my ( $trOpt, $tdOpt ) = ( $1, $2 );
        my $tableRow;
        $ret .= " \\tr$trOpt\{\n \\td$tdOpt\{";

        # The row runs up to the next row mark, or to the end.
        if ( $src =~ /\s*$tableMark$tableMark\s*/ ) {
            $tableRow = $`;
            $src      = $& . $';
        }
        else {
            $tableRow = $src;
            $tableRow =~ s/\s*$//;
            $src = '';
        }

        while ( $tableRow =~ /\s*$tableMark\s*/so ) {
            $ret .= "$`\}\n \\td$1\{";
            $tableRow = $';
        }
        $tableRow =~ s/\s*$//;
        $ret .= "$tableRow\}\n \}\n";
    }
    $ret .= "\}";
    return secondPass( $ret );
}

# \entity{name}: the HTML entity &name;
sub entity {
    my ( $option, $fields ) = @_;
    return '&' . $fields->[0] . ';';
}

# \display{...}: centered text on lines of its own (displayed math).
sub display {
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    return "\\center\{\\br\n$src\\br\\br\n\}";
}

# The local time, looked up once and then shared by \today and \now.
my @timeParts;

sub timeParts {
    @timeParts = localtime unless @timeParts;
    return @timeParts;
}

# \today: the date, e.g. "Tuesday September 18, 2001".
sub today {
    my ( $mday, $mon, $year, $wday ) = ( timeParts() )[ 3, 4, 5, 6 ];
    my @dayNames = ( 'Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday',
                     'Friday', 'Saturday' );
    my @monthNames = ( 'January', 'February', 'March', 'April', 'May',
                       'June', 'July', 'August', 'September', 'October',
                       'November', 'December' );
    return $dayNames[$wday] . ' ' . $monthNames[$mon] . " $mday, "
         . ( $year + 1900 );
}

# \now: the time as HH:MM:SS (zero padded) followed by "EDT" or "EST"
# depending on whether daylight saving time is in effect.
sub now {
    my ( $sec, $min, $hour, $isdst ) = ( timeParts() )[ 0, 1, 2, 8 ];
    return sprintf( '%02d:%02d:%02d ', $hour, $min, $sec )
         . ( $isdst ? 'EDT' : 'EST' );
}

# \comment{...}: the text is dropped from the output.
sub comment {
    my ( $option, $fields ) = @_;
    return "";
}

# Check for unmatched delimiters.  Warns about every brace, bracket,
# backslash command, vertical bar or dollar sign left in the converted
# text, with up to 20 characters of context on each side.  Returns 1 if
# anything was found and '' otherwise.
sub check {
    my ( $src ) = @_;
    my $isError = '';

    while ( $src =~ /[{}\[\]]|\\\w+\{?|\|+|\$+/g ) {
        $isError = 1;
        warn "Unparsed `$&' found:\n";
        warn " `" . substr( $`, -20 ) . $& . substr( $', 0, 20 ) . "'\n";
    }
    return $isError;
}