#Time-stamp: "2004-12-29 18:34:27 AST" #TODO: for xml2pod: # Make utf8/Latin-1 an option (default utf8?) # Make E<>ification an option (default to all) # Option for whether to delete highbit things in codeblocks (default: no?) #TODO: for pod2xml: # Option: choice of XML encoding (Latin-1 or UTF-8) # Option: whether to represent things as literals, or as numeric entities. # (and whether to use decimal entities, or hex??) require 5; package Pod::PXML; use strict; use vars qw($VERSION $XMLNS %Char2podent %Char2xmlent $LATIN_1 $XML_VALIDATE $LINK_TEXT_INFER $FUSE_ADJACENT_PRES $HIGH_BIT_OK ); $XMLNS = 'http://www.perl.com/CPAN/authors/id/S/SB/SBURKE/pxml_0.01.dtd'; $VERSION = '0.12'; # I'm going to try to keep the major version numbers in the DTD and the # module in synch. I dunno about the fractional part, tho. $LATIN_1 = 1; $XML_VALIDATE = 1; $HIGH_BIT_OK = 0; $LINK_TEXT_INFER = 0; $FUSE_ADJACENT_PRES = 1; # Whether to make " foo\n\n bar" as a single PRE, # as if it were from " foo\n \n bar\n\n" # TODO: set to 1 BEGIN { *DEBUG = sub () {0} unless defined &DEBUG } my $nil = []; use Carp; use utf8; # POD entities are just HTML entities plus verbar and sol #------------------------------------------------------------------------ # Fill out Char2podent, Char2xmlent. { use HTML::Entities (); die "\%HTML::Entities::char2entity is empty?" unless keys %HTML::Entities::char2entity; my($c,$e); while(($c,$e) = each(%HTML::Entities::char2entity)) { if($e =~ m{^&#(\d+);$}s) { $Char2podent{ord $c} = "E<$1>"; #print "num $e => E<$1>\n"; # { => E<123> # $Char2xmlent{ord $c} = $e; } elsif($e =~ m{^&([^;]+);$}s) { $Char2podent{ord $c} = "E<$1>"; #print "eng $e => E<$1>\n"; # é => E # $Char2xmlent{ord $c} = $e; } else { warn "Unknown thingy in %HTML::Entities::char2entity: $c => $e" # if $^W; } } # Points of difference between HTML entities and POD entities: $Char2podent{ord "\xA0"} = "E<160>"; # there is no E $Char2podent{ord "\xAB"} = "E"; $Char2podent{ord "\xBB"} = "E"; # Altho new POD processors also know E and E # Old POD processors don't know these two -- so leave numeric # $Char2podent{ord '/'} = 'E'; # $Char2podent{ord '|'} = 'E'; # And a few that we have to make completely sure are present. $Char2xmlent{ord '"'} = '"' ; $Char2xmlent{ord '<'} = '<' ; $Char2xmlent{ord '>'} = '>' ; $Char2podent{ord '<'} = 'E' ; $Char2podent{ord '>'} = 'E' ; } #print STDERR "Sanity: 214 is ", $Char2podent{214}, "\n"; #------------------------------------------------------------------------ sub pod2xml ($) { require Pod::Tree; my $content = $_[0]; my $tree = Pod::Tree->new; if(ref($content) eq 'SCALAR') { $tree->load_string($$content); } else { $tree->load_file($content); } unless($tree->loaded) { croak("Couldn't load pod") } return _pod_tree_as_xml($tree); } #------------------------------------------------------------------------ # Real work: sub _pod_tree_as_xml { my $root = $_[0]->get_root; DEBUG > 2 and print "TREE DUMP: <<\n", $_[0]->dump, ">>\n\n"; return "\n\n" unless $root; my $out = ''; my $trav; my $x; # scratch $trav = sub { my $it = $_[0]; my $type = $it->get_type; my $post = ''; DEBUG and print "Hitting $type\n"; if($type eq 'root') { $out .= join "\n", qq{}, qq{}, qq{}, "", '', '', ; $post = "\n"; # harmless newline, I figure. } elsif($type eq 'for') { $out .= "get_arg) . "\">"; $out .= xml_escape_maybe_cdata($it->get_text); $out .= "\n\n"; return; } elsif($type eq 'sequence') { $type = lc($it->get_letter); DEBUG and print "Sequence type \"$type\"\n"; if($type eq 'e') { # An unresolved entity. $x = $it->get_children; if($x and @$x ==1 and $x->[0]->get_type eq 'text') { $x = $x->[0]->get_text; die "Impossible entity name \"$x\"" if $x =~ m/[ \t<>]/s; # minimal sanity $out .= '&' . $x . ';'; } else { # $out .= '&WHAT;'; die "Aberrant E<..> content \"", $it->get_deep_text, "\""; } return; } elsif($type eq 'l') { # At time of writing, Pod::Tree is less than sterling in its # treatment of L<...> sequences. #use Data::Dumper; #print "LINK DUMP: {{\n", Dumper($it), "}}\n"; # Some special treatment... my $target = $it->get_target || die 'targetless link?'; my($page, $section); $out .= "get_page ); $out .= " page=\"$page\"" if length $page; $section = xml_attr_escape( $target->get_section ); $out .= " section=\"$section\"" if length $section; $out .= ">"; #if(!$LINK_TEXT_INFER and not(($x = $target->get_children) and @$x)) { unless(($x = $target->get_children) and @$x) { # There was no gloss (i.e., the bit after the "|"). if(! $LINK_TEXT_INFER) { # subvert the normal processing of children of this sequence. $out .= ""; return; } else { # Infer the text instead. my $ch; if(($ch = $it->get_children) and @$ch == 1 and $ch->[0]->get_type eq 'text' ) { # So this /is/ just some text bit that Pod::Tree implicated. # To replicate Pod::Text's inscrutible weirdness as # best we can, for sake of continuity if not actual # good sense or clarity. # The moral of the story is to always have L !!! $x = ''; if (!length $section) { $x = "the $page manpage" if length $page; } elsif ($section =~ m/^[:\w]+(?:\(\))?/) { $x .= "the $section entry"; $x .= (length $page) ? " in the $page manpage" : " elsewhere in this document"; } else { $section =~ s/^\"\s*//; $section =~ s/\s*\"$//; $x .= 'the section on "' . $section . '"'; $x .= " in the $page manpage" if length $page; } $out .= "$x"; return; # subvert the usual processing. } # Else it's complicated and scary. Fall thru. } } $post = ''; } else { # Unknown sequence. Ahwell, pass thru. $out .= "<$type>"; $post = ""; } } elsif($type eq 'list') { $x = xml_attr_escape($it->get_arg); $out .= length($x) ? "\n\n" : "\n\n"; # used to have: # sprintf "\n\n", # xml_attr_escape($it->get_list_type), # xml_attr_escape($it->get_arg) ; $post = "\n\n"; } elsif($type eq 'ordinary') { $out .= "

"; $post = "

\n\n"; } elsif($type eq 'command') { $x = $it->get_command(); if($x =~ m/^head[1234]$/is) { $x = lc($x); $out .= "<$x>"; $post = "\n\n"; } else { die "Unknown POD command \"$x\""; } } elsif($type eq 'item') { # Needs special recursion! $out .= ''; # used to have: sprintf '', # xml_attr_escape($it->get_item_type); # Recurse for the item's children: foreach my $c (@{ $it->get_children || $nil }) { $trav->($c) } $out .= "\n\n"; # Then recurse for the bastards further down... } elsif($type eq 'verbatim') { ( $FUSE_ADJACENT_PRES and $out =~ s/<\/pre>\n\n$//s ) or $out .= "
";
       # possibly combine adjacent verbatims into a single 'pre'
      $out .= xml_escape_maybe_cdata("\n" . $it->get_text . "\n");
      $out =~ s/]]>$/s;
       # combining adjacent CDATA sections is nice, and always harmless
      $out .= "
\n\n"; return; } elsif($type eq 'text') { $out .= xml_escape($it->get_text); return; } else { $out .= "\n\n"; return; } foreach my $c (@{ # Recurse... (($type eq 'item') ? $it->get_siblings() : $it->get_children()) || $nil }) { $trav->($c) } $out .= $post; return; }; $trav->($root); undef $trav; # break cyclicity print "\n\n" if DEBUG; sanitize_newlines($out); return $out; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sub xml_escape_maybe_cdata { # not destructive my $x; $x = '' unless defined($x = $_[0]); if($x =~ m/[&<>]/ and not $x =~ m/[^\x00-\x80]/) { # CDATA only if uses those [&<>], and does not use anything highbit. $x =~ s/]]>/]]>]]>"; } else { # Otherwise escape things. $x =~ s/&/&/g; $x =~ s//>/g; #$x =~ s/([^\x00-\x7E])/$Char2xmlent{ord $1} or "&#".ord($1).";"/eg; $x =~ s/([^\x00-\x7E])/"&#".ord($1).";"/eg unless $HIGH_BIT_OK; # Why care about highbittyness? Even tho we're declaring this content # to be in UTF8, might as well entitify what we can. } return $x; } sub xml_escape { # not destructive my $x; return '' unless defined($x = $_[0]); if($HIGH_BIT_OK) { $x =~ s/([&<>])/$Char2xmlent{ord $1} or "&#".ord($1).";"/eg; # Encode '&', and '<' and '>' } else { $x =~ s/([^\cm\cj\f\t !-%'-;=?-~])/$Char2xmlent{ord $1} or "&#".ord($1).";"/eg; # Encode control chars, high bit chars, '&', and '<' and '>' } return $x; } sub xml_attr_escape { # not destructive my $x; return '' unless defined($x = $_[0]); if($HIGH_BIT_OK) { $x =~ s/([&<>"])/$Char2xmlent{ord $1} or "&#".ord($1).";"/eg; # Encode '&', '"', and '<' and '>' } else { $x =~ s/([^\cm\cj\f\t !\#-\%'-;=?-~])/$Char2xmlent{ord $1} or "&#".ord($1).";"/eg; # Encode control chars, high bit chars, '"', '&', and '<' and '>' } return $x; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - sub sanitize_newlines { # DESTRUCTIVE if("\n" eq "\cm") { $_[0] =~ s/\cm?\cj/\n/g; # turn \cj and \cm\cj into \n } elsif("\n" eq "\cj") { $_[0] =~ s/\cm\cj/\n/g; # turn \cm and \cm\cj into \n } else { $_[0] =~ s/(?:(?:\cm?\cj)|\cm)/\n/g; # turn \cm\cj, \cj, or \cm into \n } return; } ########################################################################### ########################################################################### use vars qw(%Acceptable_children); { # This just recapitulates what's in the DTD: my $style = {map{;$_,1} qw(b i c x f s link)}; my $pstyle = {'#PCDATA',1, %$style}; my $pcdata = {'#PCDATA',1}; %Acceptable_children = ( 'pod' => {map{;$_,1} qw(head1 head2 head3 head4 p pre list for)}, map(($_=>$pstyle), qw(head1 head2 head3 head4 p)), 'pre' => $pcdata, 'list' => {map{;$_,1} qw(item p pre list for)}, 'item' => $pstyle, 'for' => $pcdata, map(($_=>$pstyle), qw(link b i c f x s)), ); } sub xml2pod ($) { my $content = $_[0]; require XML::Parser; my $out; my($gi, %attr, $text, $cm_set); # scratch my(@stack); my @paragraph_stack; # pop/pushed only by paragraph-containing elements, and link my @for_stack; # kept by 'for' elements my @link_stack; # kept by 'link' elements my $xml = XML::Parser->new( 'Handlers' => { ## ## ## On the way in... 'Start' => sub { (undef, $gi, %attr) = @_; push @stack, $gi; DEBUG > 1 and print ' ', join('.', @stack), "+\n"; if($XML_VALIDATE) { if(@stack < 2) { unless($gi eq 'pod') { # I think XML::Parser would catch this, but anyway. die "Can't have a childless \"$gi\" element, in $content"; } } elsif(defined($cm_set = $Acceptable_children{$stack[-2]})) { die "Can't have a \"$gi\" in a \"$stack[-2]\", in $content (stack @stack)" unless $cm_set->{$gi}; } else { die "Unknown element \"$gi\""; } # TODO: attribute validation! } if($gi =~ m/^[bicxfs]$/s) { $paragraph_stack[-1] .= "\U$gi<"; } elsif($gi eq 'p' or $gi eq 'pre') { push @paragraph_stack, ''; } elsif($gi eq 'for') { $text = $attr{'target'} || '????'; push @for_stack, $text; push @paragraph_stack, ''; } elsif($gi eq 'list') { $text = $attr{'indent'}; $out .= (defined($text) && length($text)) ? "=over $text\n\n" : "=over\n\n"; } elsif($gi eq 'item') { $out .= '=item '; push @paragraph_stack, ''; } elsif($gi =~ m/^head[1234]$/s) { push @paragraph_stack, '=' . $gi . ' '; } elsif($gi eq 'link') { # a hack push @link_stack, [$attr{'page'}, $attr{'section'}]; push @paragraph_stack, ''; } elsif($gi eq 'pod') { my $text = $attr{'xmlns'} || $XMLNS; die "pod has a foreign namespace: \"$text\" instead of \"$XMLNS\"" unless $text eq $XMLNS; } else { DEBUG and print "Opening unknown element \"$gi\"\n"; } return; }, ## ## ## And on the way out... 'End' => sub { $gi = $_[1]; DEBUG > 1 and print ' ', join('.', @stack), "-\n"; die "INSANE! Stack mismatch! $text ne $gi" unless $gi eq ($text = pop @stack); if($gi =~ m/^[bicxfs]$/s) { $paragraph_stack[-1] .= ">"; } elsif($gi eq 'p') { # A paragraph must start with non-WS, non-=, and must contain # no \n\n's until its very end. $text = pop @paragraph_stack; $text =~ s/^(\s)/Z<>$1/s; # make sure we're NOT indented $text =~ s/^=/Z<>=/s; # make sure we're NOT =-initial $text =~ s/\n+$//s; # nix terminal newlines! $text =~ s/\n(?=\n)/\n /g; # separate double-newlines unless(length $text) { DEBUG and print "Odd, null p-paragraph\n"; return; } # These don't beautify /everything/ beautifiable, but they try. while($text =~ s/([^a-zA-Z<])E/$1's that obviously don't need escaping, back into <'s while($text =~ s/^([^<]*)E/$1>/) {1} # Turn obviously harmless E's back into ">"'s. $text .= "\n\n"; $out .= $text; } elsif($gi eq 'pre') { # A verbatim paragraph must start with WS, and must contain # no \n\n's until its very end. $text = pop @paragraph_stack; $text =~ s/^\n+//s; # nix leading strictly-blank lines $text =~ s/^(\S)/ \n$1/s; # make sure we ARE indented # that means we don't have to make sure we don't start with a '=' $text =~ s/\n+$//s; # nix terminal newlines! $text =~ s/\n(?=\n)/\n /g; # separate double-newlines #$text =~ tr/\0-\xFF//CU if $LATIN_1; # since we can't E<..> things unless(length $text) { DEBUG and print "Odd, null pre-paragraph\n"; return; } $text .= "\n\n"; $out .= $text; } elsif($gi eq 'for') { my $kind = pop @for_stack; $text = "\n\n=begin $kind\n\n" . pop @paragraph_stack; $text =~ s/\n+$//s; # nix terminal newlines! $text =~ s/\n(?=\n)/\n /g; # separate double-newlines $text .= "\n\n=end $kind\n\n"; $out .= $text; } elsif($gi eq 'list') { $out .= "=back\n\n"; } elsif($gi eq 'item') { $text = pop @paragraph_stack; $text =~ s/^\s*//s; # kill leading space $text =~ s/\n+$//s; # nix terminal newlines! $text =~ s/\n(?=\n)/\n /g; # separate double-newlines $text .= "\n\n"; # These don't beautify /everything/ beautifiable, but they try. while($text =~ s/([^a-zA-Z<])E/$1's that obviously don't need escaping, back into <'s while($text =~ s/^([^<]*)E/$1>/) {1} # Turn obviously harmless E's back into ">"'s. $out .= $text; } elsif($gi =~ m/^head[1234]$/s) { $text = pop @paragraph_stack; $text =~ s/^(\s)/Z<>$1/s; # make sure we're NOT (visibly) indented $text =~ s/\n+$//s; # nix terminal newlines! $text =~ s/\n(?=\n)/\n /g; # nix any double-newlines $text .= "\n\n"; # These don't beautify /everything/ beautifiable, but they try. while($text =~ s/([^a-zA-Z<])E/$1's that obviously don't need escaping, back into <'s while($text =~ s/^([^<]*)E/$1>/) {1} # Turn obviously harmless E's back into ">"'s. $out .= $text; } elsif($gi eq 'link') { # a hack $text = pop @paragraph_stack; # "Text cannot contain the characters '/' and '|'" $text =~ s/\|/E<124>/g; # AKA verbar $text =~ s{/}{E<47>}g; # AKA sol $text =~ s/\n(?=\n)/\n /g; # nix any double-newlines, just for good measure $text .= '|' if length $text; my($xref, $section) = @{pop @link_stack}; $xref = '' unless defined $xref; # "" means 'in this document' $section = '' unless defined $section; $xref = pod_escape($xref); $xref =~ s{/}{E<47>}g; $section = pod_escape("/\"$section\"") if length $section; $section = '/"???"' unless length $xref or length $section; # signals aberrant input! $paragraph_stack[-1] .= "L<$text$xref$section>"; } elsif($gi eq 'pod') { # no-op } else { DEBUG and print "Closing unknown element \"$gi\"\n"; } return; }, ## ## ## Character data! MATANGA!!! 'Char' => sub { shift; return unless defined $_[0] and length $_[0]; # sanity if(!@stack) { die "Non-WS text on empty stack: \"$_[0]\"" unless $_[0] =~ m/^\s+$/s; } else { if(($Acceptable_children{$stack[-1]} || die "Putting text under unknown element \"$stack[-1]\"" )->{'#PCDATA'}) { # This is the only case where we can add: die "\@paragraph_stack is empty? (stack: @stack)" unless @paragraph_stack; if($stack[-1] eq 'pre') { $paragraph_stack[-1] .= $_[0]; } else { $paragraph_stack[-1] .= pod_escape($_[0]); } } else { # doesn't allow PCDATA die "Can't have non-WS text in a \"$stack[-1]\"" unless $_[0] =~ m/^\s+$/s; # Else it's just ignorable whitespace. } } return; }, # 'Comment' => sub { }, # 'Proc' => sub { }, # 'Attlist' => sub { }, # 'Element' => sub { }, # 'Doctype' => sub { }, }); # Now actually process... $out = ""; if(ref($content) eq 'SCALAR') { $xml->parse($$content); } else { $xml->parsefile($content); } $out =~ s/^([^=])/=pod\n\n$1/; # make sure that we start with a =-thingie, one way or another. $out .= "=cut\n\n"; sanitize_newlines($out); return $out; } # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - { my %e = ('<' => 'E', '>' => 'E' ); sub pod_escape { #print STDERR "IN: <$_[0]>\n"; my $it = $_[0]; $it =~ s/([^\cm\cj\f\t !-;=?-~])/$Char2podent{ord $1} or "E<".ord($1).">"/eg; # Encode control chars, high bit chars and '<' and '>' #print STDERR "OUT: <$_[0]>\n\n"; return $it; } } ########################################################################### ########################################################################### 1;