# Scraper/Plugin/ViewEvent.pm

package WWW::Mixi::Scraper::Plugin::ViewEvent;

use strict;
use warnings;
use WWW::Mixi::Scraper::Plugin;
use WWW::Mixi::Scraper::Utils qw( _uri _datetime );
use utf8;

# Query parameters accepted by this plugin, each paired with the name of
# the rule handed to validator() (presumably the check applied to that
# parameter's value -- confirm against WWW::Mixi::Scraper::Plugin).
validator {
  id      => 'is_number',
  comm_id => 'is_number',
  page    => 'is_number_or_all',
};

# Scrape an event page: the event topic itself plus its comments.
#
# Arguments:
#   $self - the plugin object; provides html_or_text(), post_process() and
#           the page address in $self->{uri}.
#   $html - the HTML source of the page, as a string.
#
# Returns a hash reference holding the scraped topic fields (time, subject,
# name, name_string, name_link, description, images, infos), the fields
# date / deadline / location / list lifted out of the info rows, the page
# 'link', and 'comments' (an array reference of comment hashes).
sub scrape {
  my ($self, $html) = @_;

  my %scraper;

  # One attached image: the anchor's onClick attribute and the thumbnail's
  # src attribute.
  $scraper{images} = scraper {
    process 'a',
      link => '@onClick';
    process 'a>img',
      thumb_link => '@src';
    result qw( link thumb_link );
  };

  # One row of the event info list: <dt> label, <dd> text and, when the
  # <dd> holds an anchor, that anchor's href and text.
  $scraper{infos} = scraper {
    process 'dt',
      name => 'TEXT';
    process 'dd',
      string => 'TEXT';
    process 'dd>a',
      link    => '@href',
      subject => 'TEXT';
    result qw( name string link subject );
  };

  # The event topic itself.
  $scraper{topic} = scraper {
    process 'dl.bbsList01>dt>span.date',
      'time' => 'TEXT';
    process 'dl.bbsList01>dt[class="bbsTitle clearfix"]>span.titleSpan',
      'subject' => 'TEXT';
    process 'dd.bbsContent>dl>dt>a',
      'name'      => 'TEXT',
      'name_link' => '@href';
    # This rule used to end with a comma, which made the following
    # process() call an argument of this one; each rule is now its own
    # statement.
    process 'dd.bbsContent>dl>dt',
      'name_string' => 'TEXT';
    process 'dd.bbsContent>dl>dd',
      'description' => $self->html_or_text;
    process 'div.communityPhoto>table>tr>td',
      'images[]' => $scraper{images};
    process 'dl.bbsList01>dd.bbsInfo>dl',
      'infos[]' => $scraper{infos};
    result qw( time subject name_string name name_link images infos description );
  };

  # The body of a single comment.
  $scraper{comment_body} = scraper {
    process 'dl.commentContent01>dt>a',
      'name_link' => '@href',
      'name'      => 'TEXT';
    process 'dl.commentContent01>dt',
      'name_string' => 'TEXT';
    process 'dl.commentContent01>dd',
      'description' => $self->html_or_text;
    process 'dl.commentContent01>dd>table>tr>td',
      'images[]' => $scraper{images};
    # name_string has to be part of the result, otherwise the name
    # fallback applied to each comment below never sees it.
    result qw( name_link name name_string description images );
  };

  # The comment list: dates, sender ids and bodies are collected as three
  # parallel lists and stitched together below.
  $scraper{comment} = scraper {
    process 'dl.commentList01>dt>span.date',
      'dates[]' => 'TEXT';
    process 'dl.commentList01>dt>span.senderId',
      'sender_ids[]' => 'TEXT';
    process 'dl.commentList01>dd',
      'comments[]' => $scraper{comment_body};
    result qw( dates comments sender_ids );
  };

  my $stash = $self->post_process($scraper{topic}->scrape(\$html))->[0];

  # No anchor in the organizer's <dt>: use the plain text of the <dt>.
  if ($stash->{name_string} && !$stash->{name}) {
    $stash->{name} = $stash->{name_string};
  }

  # Info labels whose text is copied straight into a top-level field.
  my %field_for = (
    '開催日時' => 'date',      # event date/time
    '募集期限' => 'deadline',  # application deadline
    '開催場所' => 'location',  # venue
  );

  foreach my $item (@{ $stash->{infos} || [] }) {
    my $label = $item->{name};
    next unless defined $label;  # row without a <dt>: nothing to match

    if ( my $field = $field_for{$label} ) {
      $stash->{$field} = $item->{string};
    }
    elsif ( $label eq '参加者' ) {  # participants
      $stash->{list}->{count}   = $item->{string};
      $stash->{list}->{link}    = _uri( $item->{link} );
      $stash->{list}->{subject} = $item->{subject};
    }
  }

  # XXX: the link is taken from the requested URI because it cannot be
  # extracted from the HTML (at least at the time of writing), so this
  # fails when testing with local files.
  $stash->{link} = $self->{uri};

  my $stash_c = $self->post_process($scraper{comment}->scrape(\$html))->[0];

  my @dates      = @{ $stash_c->{dates} || [] };
  my @sender_ids = @{ $stash_c->{sender_ids} || [] };
  my @comments   = @{ $stash_c->{comments} || [] };
  foreach my $comment ( @comments ) {
    $comment->{time}    = _datetime( shift @dates );
    $comment->{subject} = shift @sender_ids;

    # name_string is only a helper for the fallback; remove it so the
    # comment hash keeps the same set of keys as before.
    my $name_string = delete $comment->{name_string};
    if (!$comment->{name}) {
      $comment->{name} = $name_string || ' ';
    }

    # incompatible with WWW::Mixi to let comment links
    # look more 'permanent' to make plagger/rss readers happier
    $comment->{name_link} = _uri( $comment->{name_link} );
    $comment->{link}      = $stash->{link}
      ? _uri( $stash->{link} . '#' . $comment->{subject} )
      : undef;

    foreach my $image ( @{ $comment->{images} || [] } ) {
      $image->{link}       = _uri( $image->{link} );
      $image->{thumb_link} = _uri( $image->{thumb_link} );
    }
  }

  $stash->{comments} = \@comments;

  return $stash;
}

1;

__END__

=head1 NAME

WWW::Mixi::Scraper::Plugin::ViewEvent - scraper plugin for mixi event pages (view_event.pl)

=head1 DESCRIPTION

This is almost equivalent to WWW::Mixi->parse_view_event().

=head1 METHOD

=head2 scrape

Returns a hash reference such as:

  {
    subject => 'title of the event',
    link => 'http://mixi.jp/view_event.pl?id=xxx',
    time => 'yyyy-mm-dd hh:mm',
    date => 'yyyy-mm-dd',
    deadline => 'sometime soon',
    location => 'somewhere',
    description => 'event description',
    name => 'who plans',
    name_link => 'http://mixi.jp/show_friend.pl?id=xxx',
    list => {
      count => '8人',
      link => 'http://mixi.jp/list_event_member.pl?id=xxx&comm_id=xxx',
      subject => '参加者一覧を見る',
    },
    comments => [
      {
        subject     => 1,
        name        => 'commenter',
        name_link   => 'http://mixi.jp/show_friend.pl?id=xxxx',
        link        => 'http://mixi.jp/view_event.pl?id=xxxx#1',
        time        => 'yyyy-mm-dd hh:mm',
        description => 'comment body',
      }
    ]
  }

=head1 AUTHOR

Kenichi Ishigaki, E<lt>ishigaki at cpan.orgE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2007 by Kenichi Ishigaki.

This program is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.

=cut