#! /usr/bin/perl
# Written by Stephen Fenner
# Started 9/17/98 (1.0)
# 1.1 9/18/01
# 1.2 11/10/09 (protected tag arguments from unwanted translation)
#
# Copyright 2001 by Stephen A. Fenner. This software may be used and
# modified freely, provided these comments are preserved in their entirety.
# All other rights are reserved.
# Please let me know of any enhancements or bugfixes
# you may have by email to fenner.sa@gmail.com. Thanks.
#
# Script for converting handouts in a pseudo-LaTeX format into html
# Constants
# Program name used as a prefix in diagnostic messages.
$program = "p2h";
# NOTE(review): the change log at the top of the file mentions a 1.2
# revision, but the version is still set to 1.1 here -- confirm which is
# intended.
$version = 1.1;
# Text (including its surrounding newlines) stored as the default
# end-of-verbatim marker.
$defaultVerbatimEnd = "\nEOV\n";
# A null tag is only used to protect its argument(s) from translation and
# should disappear.
# Usage: \nulltag[text to be protected from translation]
# Here, we set the name of the null tag:
$nullTagName = 'NULLTAG';
# Useful regexps
# All of the patterns below are kept as plain strings and are interpolated
# into regular expressions where they are used.
# Stuff from an opening to a closing curly brace, with no curly braces
# in between.
$field = '(?:\{[^{}]*\})';
# Matches the optional argument of a command
# (an opening square bracket, any characters other than "]", and the
# closing square bracket).
$optArg = '\[[^\]]*\]';
# A backslash followed by an identifier, followed by an optional
# argument (in square brackets), followed by zero or more fields.
# Capture groups: (1) the identifier, (2) the optional argument including
# its brackets (empty if absent), (3) the run of fields (empty if absent).
$command = "(?:\\\\(\\w+)(?!\\w)((?!\\[)|$optArg)((?!\{)|$field+))";
# A whole line containing a verbatim command, which must start the line,
# followed by an optional argument, followed by the rest of the line,
# which must consist of zero or more decimal digits.
# Capture groups: (1) the optional argument with its brackets, (2) the
# same argument without the brackets, (3) the digits.
$verbatim = '(?:(?:^|\n)\\\\(?:verbatim|VERBATIM)(\[([^\]]+)\])?(\d*)\n)';
# Characters to be treated specially.
# (A character class; the single quote is appended separately because the
# rest of the class is written inside a single-quoted string.)
$specialChars = '[\ $\t\n{}\[\]~\\\\_^"|' . "'" . ']';
# Stuff running from "$$" to "$$", with no "$" (but at least one
# character) in between.
# The capture keeps one "$" on each side, so the captured text is itself
# a "$ ... $" span.
$displayMathenv = '\$(\$[^$]+\$)\$';
# Stuff running from "$" to "$" with no "$" (but at least one character)
# in between.
# The capture is the text between the two dollar signs.
$mathenv = '\$([^$]+)\$';
# Single non-identifier characters used as command names without a
# preceding backslash, e.g., subscript and superscript. (The next
# character is also grabbed so if it is not an opening curly brace,
# it can serve as the one-character argument, e.g., "x^2".)
$commandChars = '([_^])(.)';
# Single or double vertical bars that start table entries or rows,
# respectively. (A row starter also starts the first entry in the
# row.)
# The pattern matches ONE bar plus an optional bracketed argument, which
# is captured (empty if absent); a row starter is matched by using the
# pattern twice in a row.
$tableMark = '(?:\|((?:\[[^\]]*\])?))';
# Maps a single character to the HTML text that should replace it.
#
# Most keys are translated when the character is preceded by a backslash;
# '&', '<' and '>' are also looked up here for UNescaped occurrences, so
# that stray markup characters in the source come out as literal text.
#
# Every value must be a real entity (or a command), never the raw
# character: a raw '$', '{', '}', '[', ']' or '|' left in the text would be
# picked up again by later passes or reported as unparsed input, and a raw
# '&', '<' or '>' would be read as markup by the browser.  Numeric entities
# are used for the characters that have no HTML 4 named entity.
%char2entity =
(
    ' '  => '&nbsp;',       # forced (non-breaking) space
    '$'  => '&#36;',
    "\t" => '&nbsp;',
    "\n" => "\\br\n",       # escaped newline becomes a \br command
    '{'  => '&#123;',
    '}'  => '&#125;',
    '['  => '&#91;',
    ']'  => '&#93;',
    '~'  => '&#126;',
    '\\' => '&#92;',
    '_'  => '&#95;',
    '^'  => '&#94;',
    '&'  => '&amp;',
    '<'  => '&lt;',
    '>'  => '&gt;',
#   '"'  => '&#34;',
    '"'  => '&quot;',
    '|'  => '&#124;',
    "'"  => '\\acute'       # requires braces for single-letter argument
);
# Characters that act as commands even without a leading backslash,
# each paired with the name of the command it stands for.
%char2command = ();
$char2command{'_'} = 'sub';
$char2command{'^'} = 'sup';
# Any alphabetic commands not on this list are assumed to be HTML tag
# names and convert to HTML tags with arguments given in square brackets.
# Names are chosen to correspond with LaTeX as much as I am familiar.
# Many of these entities (especially for HTML 4.x) were found at
# http://www.alanwood.net/demos/ent4_frame.html.
# Each key in the table below is associated with a function that
# reads the arguments of the command (if any) and outputs the text
# to replace the command.  (This text may be processed further in
# a subsequent pass.)
#
# Symbol commands return ASCII-only HTML entities rather than raw
# characters, so the generated page does not depend on the encoding of
# this script or of the output file.  A value that starts with a backslash
# is itself a command and is expanded by a later pass.
%commands =
(
#   def        => \&def,
    p          => \&p,
    tabular    => \&tabular,
    entity     => \&entity,
    today      => \&today,
    now        => \&now,
    acute      => \&acute,
    comment    => \&comment,
#   assignment => \&assignment,
#   course     => \&course,
#   due        => \&due,
#   section    => \&section,
#   subsection => \&subsection,
#   list       => \&list,
#   examples   => \&examples,
#   math       => \&math,
    display    => \&display,
#   leq        => sub { return "\\u\{\&lt;\}"; },
#   geq        => sub { return "\\u\{\&gt;\}"; },
    leq        => sub { return "\\le"; },
    geq        => sub { return "\\ge"; },
    # ("neq" is defined once, with the other relation symbols below; an
    # earlier duplicate entry here was always overridden by that one.)
    log        => sub { return "log"; },
    lg         => sub { return "lg"; },
    det        => sub { return "det"; },
    divop      => sub { return "div"; },   # div conflicts with tag name
    mod        => sub { return "mod"; },
    gcd        => sub { return "gcd"; },
    lcm        => sub { return "lcm"; },
    trace      => sub { return "tr"; },
    domain     => sub { return "dom"; },
    range      => sub { return "rng"; },
    codomain   => sub { return "cod"; },
#   ldots      => sub { return "..."; },
    ldots      => sub { return '&hellip;'; },
#   cdots      => sub { return "..."; },
    cdots      => sub { return "\\cdot\\cdot\\cdot" },
    floor      => sub { return "floor"; },
    ceiling    => sub { return "ceiling"; },
    min        => sub { return "min"; },
    max        => sub { return "max"; },
#   iff        => sub { return "<==>"; },
    iff        => sub { return "\\cleftrightarrow"; },
#   implies    => sub { return "==>"; },
    implies    => sub { return "\\crightarrow"; },
#   exists     => sub { return "exists"; },
#   forall     => sub { return "for\\ all"; },
    exists     => sub { return '&exist;'; },
    forall     => sub { return '&forall;'; },
    pr         => sub { return "Pr"; },
    mid        => sub { return " : "; },
    in         => sub { return '&isin;'; },
    notin      => sub { return '&notin;'; },
#   and        => sub { return "and"; },
#   or         => sub { return "or"; },
    and        => sub { return "\\wedge"; },
    or         => sub { return "\\vee"; },
    # Capital Greek letters.
    # Unlike TeX, our command names are not case sensitive,
    # so we can't just use "\Sigma" for example.  So we prefix with "c" instead.
    calpha     => sub { return '&Alpha;'; },
    cbeta      => sub { return '&Beta;'; },
    cgamma     => sub { return '&Gamma;'; },
    cdelta     => sub { return '&Delta;'; },
    cepsilon   => sub { return '&Epsilon;'; },
    czeta      => sub { return '&Zeta;'; },
    ceta       => sub { return '&Eta;'; },
    ctheta     => sub { return '&Theta;'; },
    ciota      => sub { return '&Iota;'; },
    ckappa     => sub { return '&Kappa;'; },
    clambda    => sub { return '&Lambda;'; },
    cmu        => sub { return '&Mu;'; },
    cnu        => sub { return '&Nu;'; },
    cxi        => sub { return '&Xi;'; },
    comicron   => sub { return '&Omicron;'; },
    cpi        => sub { return '&Pi;'; },
    crho       => sub { return '&Rho;'; },
    csigma     => sub { return '&Sigma;'; },
    ctau       => sub { return '&Tau;'; },
    cupsilon   => sub { return '&Upsilon;'; },
    cphi       => sub { return '&Phi;'; },
    cchi       => sub { return '&Chi;'; },
    cpsi       => sub { return '&Psi;'; },
    comega     => sub { return '&Omega;'; },
    # Small Greek letters
    alpha      => sub { return '&alpha;'; },
    beta       => sub { return '&beta;'; },
    gamma      => sub { return '&gamma;'; },
    delta      => sub { return '&delta;'; },
    epsilon    => sub { return '&epsilon;'; },
    zeta       => sub { return '&zeta;'; },
    eta        => sub { return '&eta;'; },
    theta      => sub { return '&theta;'; },
    thetasym   => sub { return '&thetasym;'; },
    iota       => sub { return '&iota;'; },
    kappa      => sub { return '&kappa;'; },
    lambda     => sub { return '&lambda;'; },
    mu         => sub { return '&mu;'; },
    nu         => sub { return '&nu;'; },
    xi         => sub { return '&xi;'; },
    omicron    => sub { return '&omicron;'; },
    pi         => sub { return '&pi;'; },
    rho        => sub { return '&rho;'; },
    sigma      => sub { return '&sigma;'; },
    sigmaf     => sub { return '&sigmaf;'; },
    tau        => sub { return '&tau;'; },
    upsilon    => sub { return '&upsilon;'; },
    upsih      => sub { return '&upsih;'; },
    phi        => sub { return '&phi;'; },
    varphi     => sub { return '&#981;'; },     # U+03D5; no HTML 4 name
    chi        => sub { return '&chi;'; },
    psi        => sub { return '&psi;'; },
    omega      => sub { return '&omega;'; },
    piv        => sub { return '&piv;'; },
    # other math symbols from WGL4 HTML
    prime      => sub { return '&prime;'; },
    doubleprime => sub { return '&Prime;'; },
    neg        => sub { return '&not;'; },
    pm         => sub { return '&plusmn;'; },
    middot     => sub { return '&middot;'; },   # a little higher
    cdot       => sub { return '&sdot;'; },     # a little lower
    half       => sub { return '&frac12;'; },
    quart      => sub { return '&frac14;'; },
    threequart => sub { return '&frac34;'; },
    times      => sub { return '&times;'; },
    divide     => sub { return '&divide;'; },
    dagger     => sub { return '&dagger;'; },
    ddagger    => sub { return '&Dagger;'; },   # double dagger
    permille   => sub { return '&permil;'; },   # per thousandth
    fracslash  => sub { return '&frasl;'; },
    aleph      => sub { return '&alefsym;'; },
    partial    => sub { return '&part;'; },
    emptyset   => sub { return '&empty;'; },
    dell       => sub { return '&nabla;'; },
    ni         => sub { return '&ni;'; },       # backwards \in
    sum        => sub { return '&sum;'; },
    prod       => sub { return '&prod;'; },
    minus      => sub { return '&minus;'; },
    lowast     => sub { return '&lowast;'; },   # asterisk operator
    sqrt       => sub { return '&radic;'; },
    propto     => sub { return '&prop;'; },
    infty      => sub { return '&infin;'; },
    wedge      => sub { return '&and;'; },
    vee        => sub { return '&or;'; },
    cap        => sub { return '&cap;'; },
    cup        => sub { return '&cup;'; },
    integral   => sub { return '&int;'; },
    sim        => sub { return '&sim;'; },      # tilde operator
    cong       => sub { return '&cong;'; },     # tilde over equals
    approxeq   => sub { return '&asymp;'; },    # tilde over tilde
    neq        => sub { return '&ne;'; },
    equiv      => sub { return '&equiv;'; },    # three horizontal lines
    le         => sub { return '&le;'; },
    ge         => sub { return '&ge;'; },
    ne         => sub { return '&ne;'; },
    subset     => sub { return '&sub;'; },
    supset     => sub { return '&sup;'; },
    notsubset  => sub { return '&nsub;'; },
    subseteq   => sub { return '&sube;'; },
    supseteq   => sub { return '&supe;'; },
    oplus      => sub { return '&oplus;'; },
    otimes     => sub { return '&otimes;'; },
    bot        => sub { return '&perp;'; },     # also means perpendicular
    reals      => sub { return '&real;'; },
    imags      => sub { return '&image;'; },
    weierp     => sub { return '&weierp;'; },   # Weierstrass script P
    lceil      => sub { return '&lceil;'; },
    rceil      => sub { return '&rceil;'; },
    lfloor     => sub { return '&lfloor;'; },
    rfloor     => sub { return '&rfloor;'; },
    langle     => sub { return '&lang;'; },
    rangle     => sub { return '&rang;'; },
    leftarrow  => sub { return '&larr;'; },
    rightarrow => sub { return '&rarr;'; },
    uparrow    => sub { return '&uarr;'; },
    downarrow  => sub { return '&darr;'; },
    leftrightarrow  => sub { return '&harr;'; },
    cleftarrow      => sub { return '&lArr;'; },
    crightarrow     => sub { return '&rArr;'; },
    cuparrow        => sub { return '&uArr;'; },
    cdownarrow      => sub { return '&dArr;'; },
    cleftrightarrow => sub { return '&hArr;'; },
    # Other miscellaneous characters
    emdash     => sub { return '&mdash;'; },
    endash     => sub { return '&ndash;'; },
    cae        => sub { return '&AElig;'; },
    ae         => sub { return '&aelig;'; },
    coe        => sub { return '&OElig;'; },
    oe         => sub { return '&oelig;'; },
    ss         => sub { return '&szlig;'; },
    euro       => sub { return '&euro;'; },
    cent       => sub { return '&cent;'; },     # cent sign
    pound      => sub { return '&pound;'; },    # British pound
    copyright  => sub { return '&copy;'; },
    tm         => sub { return '&trade;'; },    # trademark
    reg        => sub { return '&reg;'; },      # registered trademark
    lozenge    => sub { return '&loz;'; },      # skinny white diamond
    spades     => sub { return '&spades;'; },
    clubs      => sub { return '&clubs;'; },
    hearts     => sub { return '&hearts;'; },
    diamonds   => sub { return '&diams;'; },
#   glossary   => \&glossary,
#   htmltag    => \&htmltag,
);
#sub main
# Main program: processes each command-line argument in turn.
#
#   -v      turn on progress messages (on STDERR) for the files that follow
#   -       convert standard input to standard output
#   NAME    convert NAME.ptex (the ".ptex" suffix is added if missing) into
#           NAME.html; an existing NAME.html is first renamed to
#           NAME.html.bak
#
# The input is read directly by this script (no shell is involved, so file
# names containing spaces or shell metacharacters are safe).  The output
# file is only backed up and rewritten after the source has been read and
# has passed check(), so a missing or faulty source no longer replaces a
# good HTML file with an empty one.  Dies on any I/O failure and when
# check() reports errors.
{
    my $verbose = '';

    # parse command line arguments
    while ( @ARGV ) {
        my $arg = shift @ARGV;
        if ( $arg eq '-v' ) {
            $verbose = 1;
            next;
        }

        # An empty $filename means standard input to standard output.
        my $filename = '';
        my $basename;
        if ( $arg ne '-' ) {
            $filename = $arg;
            $filename .= '.ptex' if $filename !~ /\.ptex$/;
            $basename = $filename;
            $basename =~ s/\.ptex$//;
        }

        my $label = $filename ne '' ? " $filename" : '';
        print STDERR "$program: processing$label ... " if $verbose;

        # grab the entire source file at once
        my $src;
        {
            local $/;    # undefined record separator: read everything
            if ( $filename ne '' ) {
                open( my $in, '<', $filename )
                    or die "$program: cannot open $filename for reading ($!)\n";
                $src = <$in>;
                close( $in );
            }
            else {
                $src = <STDIN>;
            }
        }
        $src = '' unless defined $src;    # empty input
        chomp $src;

        $src = firstPass( $src );
        $src = secondPass( $src );
        my $isError = check( $src );
        if ( $isError ) {
            die "\nThere were errors.\n";
        }
        $src = restoreTagArgs( $src );

        if ( $filename ne '' ) {
            my $outname = "$basename.html";
            if ( -e $outname ) {
                rename( $outname, "$outname.bak" )
                    or die "$program: backup failed ($!)\n";
            }
            # The bareword handle OUT is kept (rather than a lexical) in
            # case other parts of the file refer to it.
            open( OUT, '>', $outname )
                or die "Cannot open $outname for writing ($!)\n";
            print OUT "$src\n";
            # Buffered write errors only show up when the handle is closed.
            close( OUT )
                or die "$program: error writing $outname ($!)\n";
            chmod 0644, $outname;
        }
        else {
            print "$src\n";
        }
        print STDERR "done\n" if $verbose;
    }
}
# One pass through the entire source is used for EACH of the following tasks:
# - Convert verbatim sections into quote sections
# - Gather optional (tag) arguments so later passes leave them alone
# - Convert unescaped HTML metacharacters into literal characters
# - Convert escaped special characters into HTML entities
# - Convert "\&" into "&" (I forgot why I had to do this!)
# - Convert escaped ASCII values into HTML entities
# - Convert escaped nonalphanumeric characters into literal entities
# - Convert $$ ... $$ into displayed math
# - Convert $ ... $ into in-line math
# - Convert any named commands that are not to be directly translated
#   into HTML tags.
# Tasks should be done in the order above, because the result of one
# pass might be processed again by a later pass.
#
# Takes the whole source text as its only argument and returns the
# converted text.
sub firstPass
{
    my ( $src ) = @_;

    # turn verbatim environments into literal text
    $src = translateVerbatim( $src );

    # gather optional arguments (tag arguments) into an array
    $src = gatherTagArgs( $src );

    # literalize things that can be mistaken as tag delimiters or
    # entity starters, but are not (and are not escaped)
    $src =~ s/([<>&])/$char2entity{$1}/g;

    # literalize other special chars (escaped by backslash) to entities
    $src =~ s/\\($specialChars)/$char2entity{$1}/go;

    # not sure why this is here
    $src =~ s/\\\&/\&/g;

    # convert escaped ASCII codes: a backslash followed by decimal digits
    # becomes the numeric entity with that code, e.g. "\65" -> "&#65;"
    # (same output format as the pass for strange chars just below)
    $src =~ s/\\(\d+)/\&\#$1;/g;

    # convert other escaped strange chars to entities
    $src =~ s/\\(\W)/"\&\#".ord($1).";"/eg;

    # convert displayed math enviroments ($$ ... $$); the captured text
    # still carries one "$" on each side, so the next pass converts the
    # contents as ordinary math
    $src =~ s/$displayMathenv/\\display\{$1\}/gos;

    # convert other math environments ($ ... $)
    $src =~ s/$mathenv/convertMath($1)/goes;

    # convert nonescaped commands
    $src =~ s/$commandChars/convertCommandChar($1,$2)/goes;

    return $src;
}
# Multipass bottom-up parsing/conversion.
# Each pass converts the innermost commands into HTML tags or, if
# there is an argument, "
/g; return "
$src
\n"; }

# Handler for \tabular.  $option is the optional (square-bracket)
# argument and $fields->[0] is the table body, in which a double table
# mark ("||", each bar optionally followed by a bracketed argument) starts
# a row together with its first entry and a single mark ("|") starts each
# further entry.  The body is rewritten into nested \table / \tr / \td
# commands, and the result of running secondPass() on that text is
# returned.
sub tabular
{
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];

    if ( $option ) {
        $option =~ s/^\s*//;
        $option = "\[$option\]";
    }
    my $ret = "\\table$option\{\n";

    # Each iteration strips one row starter from the front of $src and
    # emits one \tr{...} block.
    while ( $src =~ s/^\s*$tableMark$tableMark\s*//so ) {
        my ( $trOpt, $tdOpt ) = ( $1, $2 );
        my $tableRow;
        $ret .= " \\tr$trOpt\{\n \\td$tdOpt\{";

        # The row runs up to the next row starter, or to the end of the
        # body if there is none.
        if ( $src =~ /\s*$tableMark$tableMark\s*/ ) {
            $tableRow = $`;
            $src = $& . $';
        }
        else {
            $tableRow = $src;
            $tableRow =~ s/\s*$//;
            $src = '';
        }

        # Split the row into entries at each single table mark.
        while ( $tableRow =~ /\s*$tableMark\s*/so ) {
            $ret .= "$`\}\n \\td$1\{";
            $tableRow = $';
        }
        $tableRow =~ s/\s*$//;
        $ret .= "$tableRow\}\n \}\n";
    }
    $ret .= "\}";
    return secondPass( $ret );
}

# Handler for \entity{name}: returns the HTML entity "&name;".
sub entity
{
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    return "\&$src;";
}

# Handler for \display{text}: wraps the text in a \center command with
# line breaks before and after it.
sub display
{
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    # print "display: $src\n";
    return "\\center\{\\br\n$src\\br\\br\n\}";
}

# Broken-down local time shared by today() and now().  It is read once,
# on first use, so both commands report the same instant.
# (The list must be parenthesized: "my $sec, $min, ..." would declare only
# $sec and leave the others as package globals.)
my $gotTime = 0;
my ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst );

# Fills the shared time fields from localtime the first time it is called;
# later calls do nothing.
sub _captureTime
{
    return if $gotTime;
    ( $sec, $min, $hour, $mday, $mon, $year, $wday, $yday, $isdst ) = localtime;
    $gotTime = 1;
    return;
}

# Handler for \today: returns the date as
# "Weekday Month D, YYYY", e.g. "Tuesday November 10, 2009".
sub today
{
    _captureTime();
    my @dayNames = ( 'Sunday', 'Monday', 'Tuesday', 'Wednesday',
                     'Thursday', 'Friday', 'Saturday' );
    my @monthNames = ( 'January', 'February', 'March', 'April', 'May',
                       'June', 'July', 'August', 'September', 'October',
                       'November', 'December' );
    my $ret = $dayNames[$wday];
    $ret .= " ";
    $ret .= $monthNames[$mon];
    $ret .= " $mday, ";
    $ret .= $year + 1900;    # localtime counts years from 1900
    return $ret;
}

# Handler for \now: returns the time as "HH:MM:SS " followed by "EDT" when
# localtime reports daylight saving time and "EST" otherwise (the zone
# names are fixed, whatever the machine's time zone is).
sub now
{
    _captureTime();
    my $ret = sprintf( "%02d:%02d:%02d ", $hour, $min, $sec );
    $ret .= $isdst ? "EDT" : "EST";
    return $ret;
}

# Handler for \acute{x}: returns the entity "&xacute;" for the letter x.
sub acute
{
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    return "\&$src" . "acute;";
}

# Handler for \comment{...}: the argument is dropped and nothing is output.
sub comment
{
    my ( $option, $fields ) = @_;
    my $src = $fields->[0];
    return "";
}

# Check for unmatched delimiters.
# Scans the converted text for anything that should have been consumed by
# the earlier passes: a brace or square bracket, a backslash command
# (with an opening brace, if one follows), a run of vertical bars, or a
# run of dollar signs.  For every such leftover two warnings are issued:
# one naming the leftover and one showing it with up to 20 characters of
# context on either side.
# Returns 1 if anything was found and the empty string otherwise.
sub check
{
    my ( $text ) = @_;
    my $found = 0;

    while ( $text =~ /[{}\[\]]|\\\w+\{?|\|+|\$+/g ) {
        # Offsets of the current match within $text.
        my ( $from, $to ) = ( $-[0], $+[0] );
        my $token = substr( $text, $from, $to - $from );

        # At most the last 20 characters before the match ...
        my $lead = substr( $text, 0, $from );
        $lead = substr( $lead, -20 ) if length( $lead ) > 20;
        # ... and at most the first 20 characters after it.
        my $trail = substr( $text, $to, 20 );

        warn "Unparsed `${token}' found:\n";
        warn " `" . $lead . $token . $trail . "'\n";
        $found++;
    }

    return $found ? 1 : '';
}