lib/Web/Microformats2/Parser.pm

package Web::Microformats2::Parser;

use Moo;
use Types::Standard qw(InstanceOf);
use HTML::TreeBuilder::XPath;
use HTML::Entities;
use v5.10;
use Scalar::Util qw(blessed);
use JSON;
use DateTime::Format::ISO8601;
use URI;
use Carp;

use Web::Microformats2::Item;
use Web::Microformats2::Document;

use Readonly;

has 'url_context' => (
    is => 'rw',
    isa => InstanceOf['URI'],
    coerce => sub { URI->new( $_[0] ) },
    lazy => 1,
    clearer => '_clear_url_context',
    default => sub { URI->new( 'http://example.com/' ) },
);

sub parse {
    my $self = shift;
    my ( $html, %args ) = @_;

    $self->_clear;
    if ( $args{ url_context } ) {
        $self->url_context( $args{url_context} );
    }

    my $tree = HTML::TreeBuilder::XPath->new;
    $tree->ignore_unknown( 0 );
    $tree->no_space_compacting( 1 );
    $tree->ignore_ignorable_whitespace( 0 );
    $tree->no_expand_entities( 1 );

    # Adding HTML5 elements because it's 2018.
    foreach (qw(article aside details figcaption figure footer header main mark nav section summary time)) {
        $HTML::TreeBuilder::isBodyElement{$_}=1;
    }

    $tree->parse( $html );

    if ( my $base_url = $tree->findvalue( './/base/@href' ) ) {
        $self->url_context( $base_url );
    }

    my $document = Web::Microformats2::Document->new;
    $self->analyze_element( $document, $tree );
    return $document;
}

# analyze_element: Recursive method that scans an element for new microformat
# definitions (h-*) or properties (u|dt|e|p-*) and then does the right thing.
# It also builds up the MF2 document's rels and rel-urls as it goes.
sub analyze_element {
    my $self = shift;
    my ( $document, $element, $current_item ) = @_;

    return unless blessed( $element) && $element->isa( 'HTML::Element' );

    $self->_add_element_rels_to_mf2_document( $element, $document );

    my $mf2_attrs = $self->_tease_out_mf2_attrs( $element );

    my $h_attrs = delete $mf2_attrs->{h};
    my $new_item;
    if ( $h_attrs->[0] ) {
        $new_item = Web::Microformats2::Item->new( {
            types => $h_attrs,
            parent => $current_item,
        } );
        $document->add_item( $new_item );
        unless ( $current_item ) {
            $document->add_top_level_item( $new_item );
        }
    }

    while (my ($mf2_type, $properties_ref ) = each( %$mf2_attrs ) ) {
        next unless $current_item;
        next unless @{ $properties_ref };
        if ( $mf2_type eq 'p' ) {
            # p-property:
            # A catch-all generic property to store on the current
            # MF2 item being defined.
            # (If this same element begins an h-* microformat, we don't parse
            # this p-* any further; instead we'll store the new item under
            # this property name.)
            unless ( $new_item ) {
                for my $property ( @$properties_ref ) {
                    my $value = $self->_parse_property_value( $element );
                    if ( defined $value ) {
                        $current_item->add_property(
                            "p-$property",
                            $value,
                        );
                    }
                }
            }
        }
        elsif ( $mf2_type eq 'u' ) {
            # u-property:
            # Look for a URL in child attributes, and store it as a property.

            # (But not if a new h-format has been defined, in which case we'll
            # just use the u-property's name to store it. Why would you do that
            # instead of using a p-property? I don't know, but the tests demand
            # it.)
            unless ( $new_item ) {
                for my $property ( @$properties_ref ) {
                    my $vcp_fragments_ref =
                        $self->_seek_value_class_pattern( $element );
                    if ( my $url = $self->_tease_out_url( $element ) ) {
                        $current_item->add_property( "u-$property", $url );
                    }
                    elsif ( @$vcp_fragments_ref ) {
                        $current_item->add_property(
                            "u-$property",
                            join q{}, @$vcp_fragments_ref,
                        )
                    }
                    elsif ( $url = $self->_tease_out_unlikely_url($element)) {
                        $current_item->add_property( "u-$property", $url );
                    }
                    else {
                        $current_item->add_property(
                            "u-$property",
                            _trim( $element->as_text ),
                        );
                    }
                }
            }
        }
        elsif ( $mf2_type eq 'e' ) {
            # e-property:
            # Create a struct with keys "html" and "value", and then
            # store this in a new property.
            for my $property ( @$properties_ref ) {
                my %e_data;
                for my $content_piece ( $element->content_list ) {

                    # Make sure all URLs found in certain HTML attrs are
                    # absolute.
                    if ( ref $content_piece ) {
                        # XXX This is probably a bit too loose about what tags
                        #     these attrs can appear on.
                        for my $href_element ( $content_piece, $content_piece->findnodes('.//*[@href|@src]') ) {
                            foreach ( qw( href src ) ) {
                                my $url = $href_element->attr($_);
                                if ( $url ) {
                                    my $abs_url = URI->new_abs( $url, $self->url_context)->as_string;
                                    $href_element->attr( $_=> $abs_url );
                                }
                            }
                        }
                        $e_data{html} .= $content_piece->as_HTML( '<>&', undef, {} );

                    }
                    else {

                        $e_data{html} .= $content_piece;
                    }
                }
                $e_data{ value } = _trim (decode_entities( $element->as_text) );

                # The official tests specifically trim space-glyphs per se;
                # all other trailing whitespace stays. Shrug.
                $e_data{ html } =~ s/ +$//;

                $current_item->add_property( "e-$property", \%e_data );
            }
        }
        elsif ( $mf2_type eq 'dt' ) {
            # dt-property:
            # Read a child attribute as an ISO-8601 date-time string.
            # Store it as a property in the MF2 date-time representation format.
            for my $property ( @$properties_ref ) {
                my $dt_string;
                my $vcp_fragments_ref =
                    $self->_seek_value_class_pattern( $element );
                if ( @$vcp_fragments_ref ) {
                    $dt_string = $self->_format_datetime(join (q{T}, @$vcp_fragments_ref), $current_item);
                }
                elsif ( my $alt = $element->findvalue( './@datetime|@title|@value' ) ) {
                    $dt_string = $alt;
                }
                elsif ( my $text = $element->as_trimmed_text ) {
                    $dt_string = $text;
                }
                if ( defined $dt_string ) {
                    $current_item->add_property(
                        "dt-$property",
                        $dt_string,
                    );
                }
            }
        }
    }

    if ( $new_item ) {
        for my $child_element ( $element->content_list ) {
            $self->analyze_element( $document, $child_element, $new_item );
        }

        # Now that the new item's been recursively scanned, perform
        # some post-processing.
        # First, add any implied properties.
        for my $impliable_property (qw(name photo url)) {
             unless ( $new_item->has_property( $impliable_property ) ) {
                 my $method = "_set_implied_$impliable_property";
                $self->$method( $new_item, $element );
            }
        }

        # Put this onto the parent item's property-list, or its children-list,
        # depending on context.
        my @item_properties;
        for my $prefix (qw( u p ) ) {
            push @item_properties, map { "$prefix-$_" } @{ $mf2_attrs->{$prefix} };
        }
        if ( $current_item && @item_properties ) {
            for my $item_property ( @item_properties ) {
                # We place a clone of the new item into the current item's
                # property list, rather than the item itself. This allows for
                # edge cases where the same item needs to go under multiple
                # properties, but carry different 'value' attributes.
                my $cloned_new_item =
                    bless { %$new_item }, ref $new_item;

                $current_item
                    ->add_property( "$item_property", $cloned_new_item );

                # Now add a "value" attribute to this new item, if appropriate,
                # according to the MF2 spec.
                my $value_attribute;
                if ( $item_property =~ /^p-/ ) {
                    if ( my $name = $new_item->get_properties('name')->[0] ) {
                        $value_attribute = $name;
                    }
                    else {
                        $value_attribute =
                            $self->_parse_property_value( $element );
                    }
                }
                elsif ( $item_property =~ /^u-/ ) {
                    $value_attribute = $new_item->get_properties('url')->[0];
                }

                $cloned_new_item->value( $value_attribute ) if defined ($value_attribute);
            }
        }
        elsif ($current_item) {
            $current_item->add_child ( $new_item );
        }

    }
    else {
        for my $child_element ( $element->content_list ) {
            $self->analyze_element( $document, $child_element, $current_item );
        }
    }
}

sub _tease_out_mf2_attrs {
    my $self = shift;
    my ( $element ) = @_;

    my %mf2_attrs;
    foreach ( qw( h e u dt p ) ) {
        $mf2_attrs{ $_ } = [];
    }

    my $class_attr = $element->attr('class');
    if ( $class_attr ) {
        while ($class_attr =~ /\b(h|e|u|dt|p)-([a-z]+(\-[a-z]+)*)($|\s)/g ) {
            my $mf2_type = $1;
            my $mf2_attr = $2;

            push @{ $mf2_attrs{ $mf2_type } }, $mf2_attr;
        }
    }

    return \%mf2_attrs;
}

sub _tease_out_url {
    my $self = shift;
    my ( $element ) = @_;

    my $xpath;
    my $url;
    if ( $element->tag =~ /^(a|area|link)$/ ) {
        $xpath = './@href';
    }
    elsif ( $element->tag =~ /^(img|audio)$/ ) {
        $xpath = './@src';
    }
    elsif ( $element->tag eq 'video' ) {
        $xpath = './@src|@poster';
    }
    elsif ( $element->tag eq 'object' ) {
        $xpath = './@data';
    }

    if ( $xpath ) {
        $url = $element->findvalue( $xpath );
    }

    if ( defined $url ) {
        $url = URI->new_abs( $url, $self->url_context )->as_string;
    }

    return $url;
}

sub _tease_out_unlikely_url {
    my $self = shift;
    my ( $element ) = @_;

    my $xpath;
    my $url;
    if ( $element->tag eq 'abbr' ) {
        $xpath = './@title';
    }
    elsif ( $element->tag =~ /^(data|input)$/ ) {
        $xpath = './@value';
    }

    if ( $xpath ) {
        $url = $element->findvalue( $xpath );
    }

    return $url;
}

sub _set_implied_name {
    my $self = shift;
    my ( $item, $element ) = @_;

    my $types = $item->types;

    return if $item->has_properties || $item->has_children;

    my $xpath;
    my $name;
    my $kid;
    my $accept_if_empty = 1; # If true, then null-string names are okay.
    if ( $element->tag =~ /^(img|area)$/ ) {
        $xpath = './@alt';
    }
    elsif ( $element->tag eq 'abbr' ) {
        $xpath = './@title';
    }
    elsif (
        ( $kid = $self->_non_h_unique_child( $element, 'img' ) )
        || ( $kid = $self->_non_h_unique_child( $element, 'area' ) )
    ) {
        $xpath = './@alt';
        $accept_if_empty = 0;
    }
    elsif ( $kid = $self->_non_h_unique_child( $element, 'abbr' ) ) {
        $xpath = './@title';
        $accept_if_empty = 0;
    }
    elsif (
        ( $kid = $self->_non_h_unique_grandchild( $element, 'img' ) )
        || ( $kid = $self->_non_h_unique_grandchild( $element, 'area' ) )
    ) {
        $xpath = './@alt';
        $accept_if_empty = 0;
    }
    elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'abbr' ) ) {
        $xpath = './@title';
        $accept_if_empty = 0;
    }

    my $foo = $kid || $element;

    if ( $xpath ) {
        my $element_to_check = $kid || $element;
        my $value = $element_to_check->findvalue( $xpath );
         if ( ( $value ne q{} ) || $accept_if_empty ) {
            $name = $value;
         }
    }

    unless ( defined $name ) {
        $name = _trim( $element->as_text );
    }

    if ( length $name > 0 ) {
         $item->add_property( 'p-name', $name );
    }

}

sub _set_implied_photo {
    my $self = shift;
    my ( $item, $element ) = @_;

    my $xpath;
    my $url;
    my $kid;

    if ( $element->tag eq 'img' ) {
        $xpath = './@src';
    }
    elsif ( $element->tag eq 'object' ) {
        $xpath = './@data';
    }
    elsif ( $kid = $self->_non_h_unique_child( $element, 'img' ) ) {
        $xpath = './@src';
        $element = $kid;
    }
    elsif ( $kid = $self->_non_h_unique_child( $element, 'object' ) ) {
        $xpath = './@data';
        $element = $kid;
    }
    elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'img' ) ) {
        $xpath = './@src';
        $element = $kid;
    }
    elsif ( $kid = $self->_non_h_unique_grandchild( $element, 'object' ) ) {
        $xpath = './@data';
        $element = $kid;
    }

    if ( $xpath ) {
        $url = $element->findvalue( $xpath );
    }

    if ( defined $url ) {
        $url = URI->new_abs( $url, $self->url_context )->as_string;
        $item->add_property( 'u-photo', $url );
    }

}

sub _set_implied_url {
    my $self = shift;
    my ( $item, $element ) = @_;

    my $xpath;
    my $url;

    my $kid;
    if ( $element->tag =~ /^(a|area)$/ ) {
        $xpath = './@href';
    }
    elsif (
        ( $kid = $self->_non_h_unique_child( $element, 'a' ) )
        || ( $kid = $self->_non_h_unique_child( $element, 'area' ) )
        || ( $kid = $self->_non_h_unique_grandchild( $element, 'a' ) )
        || ( $kid = $self->_non_h_unique_grandchild( $element, 'area' ) )
    ) {
        $xpath = './@href';
        $element = $kid;
    }

    if ( $xpath ) {
        $url = $element->findvalue( $xpath );
    }

    if ( defined $url ) {
        $url = URI->new_abs( $url, $self->url_context )->as_string;
        $item->add_property( 'u-url', $url );
    }

}

sub _non_h_unique_child {
    my $self = shift;
    my ( $element, $tag ) = @_;

    my @children = grep { (ref $_) && $_->tag eq $tag  } $element->content_list;

    if ( @children == 1 ) {
        my $mf2_attrs = $self->_tease_out_mf2_attrs( $children[0] );
        if (not ( $mf2_attrs->{h}->[0] ) ) {
            return $children[0];
        }
    }

    return;
}

sub _non_h_unique_grandchild {
    my $self = shift;
    my ( $element, $tag ) = @_;

    my @children = grep { ref $_ } $element->content_list;

    if ( @children == 1 ) {
        my $mf2_attrs = $self->_tease_out_mf2_attrs( $children[0] );
        if (not ( $mf2_attrs->{h}->[0] ) ) {
            return $self->_non_h_unique_child( $children[0], $tag );
        }
    }

    return;
}

sub _clear {
    my $self = shift;

    $self->_clear_url_context;
}

sub _seek_value_class_pattern {
    my $self = shift;

    my ( $element, $vcp_fragments_ref ) = @_;

    $vcp_fragments_ref ||= [];

    my $class = $element->attr( 'class' );
    if ( $class && $class =~ /\bvalue(-title)?\b/ ) {
        if ( $1 ) {
            push @$vcp_fragments_ref, $element->attr( 'title' );
        }
        elsif ( ( $element->tag =~ /^(del|ins|time)$/ ) && defined( $element->attr('datetime'))) {
            push @$vcp_fragments_ref, $element->attr('datetime');
        }
        else {
            my $html;
            for my $content_piece ( $element->content_list ) {
                if ( ref $content_piece ) {
                    $html .= $content_piece->as_HTML;
                }
                else {
                    $html .= $content_piece;
                }
            }
            push @$vcp_fragments_ref, $html;
        }
    }
    else {
        for my $child_element ( grep { ref $_ } $element->content_list ) {
            $self->_seek_value_class_pattern(
                $child_element, $vcp_fragments_ref
            );
        }
    }

    return $vcp_fragments_ref;
}

sub _trim {
    my ($string) = @_;
    $string =~ s/^\s+//;
    $string =~ s/\s+$//;
    return $string;
}

sub _format_datetime {
    my ($self, $dt_string, $current_item) = @_;

    my $dt;

    # Knock off leading/trailing whitespace.
    $dt_string = _trim($dt_string);

    $dt_string =~ s/t/T/;

    # Note presence of AM/PM, but toss it out of the string.
    $dt_string =~ s/((?:a|p)\.?m\.?)//i;
    my $am_or_pm = $1 || '';

    # Store the provided TZ offset.
    my ($provided_offset) = $dt_string =~ /([\-\+Z](?:\d\d:?\d\d)?)$/;
    $provided_offset ||= '';

    # Reformat HHMM offset as HH:MM.
    $dt_string =~ s/(-|\+)(\d\d)(\d\d)/$1$2:$3/;

    # Store the provided seconds.
    my ($seconds) = $dt_string =~ /\d\d:\d\d:(\d\d)/;
    $seconds = '' unless defined $seconds;

    # Insert :00 seconds on time when paired with a TZ offset.
    $dt_string =~ s/T(\d\d:\d\d)([\-\+Z])/T$1:00$2/;
    $dt_string =~ s/^(\d\d:\d\d)([\-\+Z])/$1:00$2/;

    # Zero-pad hours when only a single-digit hour appears.
    $dt_string =~ s/T(\d)$/T0$1/;
    $dt_string =~ s/T(\d):/T0$1:/;

    # Insert :00 minutes on time when only an hour is listed.
    $dt_string =~ s/T(\d\d)$/T$1:00/;

    # Treat a space separator between date & time as a 'T'.
    $dt_string =~ s/ /T/;

    # If this is a time with no date, try to apply a previously-seen
    # date to it.
    my $date_is_defined = 1;
    if ( $dt_string =~ /^\d\d:/ ) {
        if ( my $previous_dt = $current_item->last_seen_date ) {
            $dt_string = $previous_dt->ymd . "T$dt_string";
        }
        else {
            $date_is_defined = 0;
            carp "Encountered a value-class datetime with only a time, "
                 . "no date, and no date defined earlier. Results may "
                 . "not be what you expect. (Data: $dt_string)";
        }
    }

    eval {
    $dt = DateTime::Format::ISO8601->new
              ->parse_datetime( $dt_string );
    };

    return if $@;

    if ($date_is_defined) {
        $current_item->last_seen_date( $dt );
    }

    if ($am_or_pm =~ /^[pP]/) {
        # There was a 'pm' specified, so add 12 hours.
        $dt->add( hours => 12 );
    }

    my $format;
    if ( ($dt_string =~ /-/) && ($dt_string =~ /[ T]/) ) {
        my $offset;
        if ($provided_offset eq 'Z') {
            $offset = 'Z';
        }
        elsif ($provided_offset) {
            $offset = '%z';
        }
        else {
            $offset = '';
        }
        $seconds = ":$seconds" if length $seconds;
        $format = "%Y-%m-%d %H:%M$seconds$offset";
    }
    elsif ( $dt_string =~ /-/ ) {
        $format = '%Y-%m-%d';
    }

    return $dt->strftime( $format );
}

sub _parse_property_value {
    my ( $self, $element ) = @_;

    my $value;

    my $vcp_fragments_ref =
        $self->_seek_value_class_pattern( $element );
    if ( @$vcp_fragments_ref ) {
        $value = join q{}, @$vcp_fragments_ref;
    }
    elsif ( my $alt = $element->findvalue( './@title|@value|@alt' ) ) {
        $value = $alt;
    }
    elsif ( my $text = _trim( decode_entities($element->as_text) ) ) {
        $value = $text;
    }

    return $value;
}

sub _add_element_rels_to_mf2_document {
    my ( $self, $element, $document ) = @_;

    return unless $element->tag =~ /^(a|link)$/;

    my $rel = $element->attr( 'rel' );
    return unless defined $rel;

    my $href = $element->attr( 'href' );
    my $url = URI->new_abs( $href, $self->url_context)->as_string;

    my @rels = split /\s+/, $rel;
    for my $rel ( @rels ) {
        $document->add_rel( $rel, $url );
    }

    my $rel_url_value = {};
    foreach (qw( hreflang media title type ) ) {
        next if defined $rel_url_value->{ $_ };
        my $value = $element->attr( $_ );
        if ( defined $value ) {
            $rel_url_value->{ $_ } = $value;
        }
    }
    my $text = ($element->as_text);
    if ( defined $text ) {
        $rel_url_value->{ text } = $text;
    }

    $rel_url_value->{ rels } = \@rels;

    $document->add_rel_url( $url, $rel_url_value );

}

1;

=pod

=head1 NAME

Web::Microformats2::Parser - Read Microformats2 information from HTML

=head1 DESCRIPTION

An object of this class represents a Microformats2 parser.

See L<Web::Microformats2> for further context and purpose.

=head1 METHODS

=head2 Class Methods

=head3 new

 $parser = Web::Microformats2::Parser->new;

Returns a parser object.

=head2 Object Methods

=head3 parse

 $doc = $parser->parse( $html, %args );

Pass in a string containing HTML which itself contains Microformats2
metadata, and receive a L<Web::Microformats2::Document> object in return.

The optional args hash recognizes the following keys:

=over

=item url_context

A L<URI> object or URI-shaped string that will be used as a context for
transforming all relative URL properties encountered within MF2 tags
into absolute URLs.

The default value is C<http://example.com>, so you'll probably want to
set this to something more interesting, such as the absolute URL of the
HTML that we are parsing.

=back

=head1 AUTHOR

Jason McIntosh ([email protected])

=head1 COPYRIGHT AND LICENSE

This software is Copyright (c) 2018 by Jason McIntosh.

This is free software, licensed under:

  The MIT (X11) License
	Global
`s`	Focus search bar
`?`	Bring up this help dialog
	GitHub
`g` `p`	Go to pull requests
`g` `i`	go to github issues (only if github is preferred repository)
	POD
`g` `a`	Go to author
`g` `c`	Go to changes
`g` `i`	Go to issues
`g` `d`	Go to dist
`g` `r`	Go to repository/SCM
`g` `s`	Go to source
`g` `b`	Go to file browse
	Search terms
module: (e.g. module:Plugin)
distribution: (e.g. distribution:Dancer auth)
author: (e.g. author:SONGMU Redis)
version: (e.g. version:1.00)