#!/usr/bin/perl # Copyright (C) 2007 王亮 # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public Licence as published by # the Free Software Foundation; either version 2, or (at your option) # any later version. # This program is distributed in the hope that it will be useful, but # WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # General Public Licence for more details. use strict; use warnings; use YAML::Syck; use HTML::Template; use DateTime; use DateTime::Format::W3CDTF; use XML::Feed; use LWP::UserAgent; use URI; use File::Path; use HTTP::Status; use Encode; use Getopt::Long; use Digest::SHA; use File::HomeDir; use File::Spec::Functions; use IO::File; use Scalar::Util qw(blessed); my $output = "subscription.html"; my $parse_only; my $remove_old_entries; my $max_feed_num; my $feed_file; my $feed_url; my $result = GetOptions ( "output=s" => \$output, "parse-only" => \$parse_only, "remove-old-entries" => \$remove_old_entries, "max-feed-num|f=i" => \$max_feed_num, "feed-file=s" => \$feed_file, "feed-url=s" => \$feed_url, ); die "Usage: perl print.pl [--output filename] [--parse-only] [--remove-old-entries] [[--max-feed-num=NUM]|[-f NUM]]\n" unless $result; my $feedman_home = catdir(File::HomeDir->my_home, ".feedman"); my $templatefile = catfile($feedman_home, "feedman.tmpl"); my $cachefile = catfile($feedman_home, "feed.yaml"); my $datefile = catfile($feedman_home, "last_update_date.yaml"); my $entryfile = catfile($feedman_home, "last_update_entries.yaml"); my $getnewfile = catfile($feedman_home, "getnew.yaml"); my %subscriptions; my %last_update_date; my %last_update_entries; my %always_get_new; my $now = DateTime->now(time_zone => 'Asia/Shanghai'); my $total_new_entries = 0; check_and_load(); main() unless caller(); sub main { if (defined $feed_file || defined $feed_url) { my $filename; my $last_update; if (defined $feed_file) { $filename = $feed_file; } else { my $s = get_subscription($feed_url); $filename = catfile(make_directory($s->{xmlUrl}), "feed"); $last_update = get_last_update_date($s->{htmlUrl}); } $filename = get_correct_feed_filename($filename); my $r = get_latest_update_date($filename, $last_update); if (defined $r) { my $date = $r->{date}; my $str = DateTime::Format::W3CDTF->format_datetime($date); print "latest update: $str\n"; } else { die "Date not found.\n"; } } else { my @htmlUrl = keys %subscriptions; my $items = list_items(\@htmlUrl); my $sorted_items = sort_items_by_date($items); generate_html_from_template(scalar @htmlUrl, $sorted_items); if (!(defined $parse_only) && !(defined $max_feed_num)) { &save_updated_info; } } } sub check_and_load { local $YAML::Syck::ImplicitUnicode = 1; die "$templatefile not found.\n" unless -e $templatefile and -f _; if (-e $cachefile and -f _) { my $data = LoadFile($cachefile); if (defined $max_feed_num) { if ($max_feed_num > 0) { for my $k (keys %$data) { $subscriptions{$k} = $data->{$k}; last if --$max_feed_num == 0; } } else { warn "invalid value($max_feed_num) from option --max-feed-num, load all feeds.\n"; } } else { %subscriptions = %$data; } } else { die "$cachefile not found.\n"; } if (-e $datefile and -f _) { my $data = LoadFile($datefile); %last_update_date = %$data; for my $key (keys %$data) { parse_last_update_date($key); } } else { warn "$datefile not found.\n"; } if (-e $entryfile and -f _) { my $data = LoadFile($entryfile); %last_update_entries = %$data; } else { warn "$entryfile not found.\n"; } if (-e $getnewfile and -f _) { my $date = LoadFile($getnewfile); %always_get_new = %$date; } else { warn "$getnewfile not found.\n"; } } sub get_last_update_date { my $url = shift; return unless exists $last_update_date{$url}; return $last_update_date{$url}; } sub set_last_update_date { my $url = shift; $last_update_date{$url} = shift; } sub remove_last_update_date { my $url = shift; delete $last_update_date{$url}; } sub format_last_update_date { my $url = shift; my $d = get_last_update_date($url); set_last_update_date($url, DateTime::Format::W3CDTF->format_datetime($d)); } sub parse_last_update_date { my $url = shift; my $d = get_last_update_date($url); set_last_update_date($url, DateTime::Format::W3CDTF->parse_datetime($d)); } sub get_correct_feed_filename { my $filename = shift; if ($filename =~ m/blog\.sina\.com\.cn/) { my $newfile = "$filename.new"; warn "$newfile not found.\n" unless -e $newfile and -f _; return $newfile; } else { return $filename; } } # mirror still works since original file doesn't change. sub correct_feed { my $filename = shift; my $newfile = get_correct_feed_filename($filename); if ($filename ne $newfile) { my $in = IO::File->new($filename, "r") or die "can not open $filename.\n"; my $out = IO::File->new("$newfile", "w") or die "can not open $newfile.\n"; while (my $line = <$in>) { chomp $line; $line =~ s{(.*?)GMT\+8()}{$1+0800$2}; print $out "$line\n"; } close $in; close $out; return $newfile; } return $filename; } sub get_subscription { my $url = shift; return $subscriptions{$url}; } sub get_new_always { my $url = shift; return exists $always_get_new{$url}; } sub get_digest_string { my ($url, $feed) = @_; if (exists $always_get_new{$url}->{$feed}) { # convert number to string by quoting. return "$always_get_new{$url}->{$feed}"; } else { return "N/A"; } } sub set_digest { my ($url, $feed, $digest) = @_; $always_get_new{$url}->{$feed} = $digest; } sub list_items { my $htmlUrl = shift; my $ua = LWP::UserAgent->new; $ua->timeout(30); my @items; for my $url (@$htmlUrl) { my $s = get_subscription($url); my $filename = catfile(make_directory($s->{xmlUrl}), "feed"); # list of available feeds my @feeds; if (exists $s->{feeds}) { @feeds = (ref $s->{feeds}) ? @{$s->{feeds}} : ($s->{feeds}); } unshift @feeds, $s->{xmlUrl}; # get update date and new entries of feed my $update; my @new_feed_entries; if (defined $parse_only) { $filename = get_correct_feed_filename($filename); # update date is never undefined when parsing only. $update = get_last_update_date($s->{htmlUrl}); if (defined $update) { @new_feed_entries = get_previous_entries($filename); } else { my $r = get_latest_update_date($filename); if (defined $r) { $update = $r->{date}; @new_feed_entries = @{$r->{entries}}; } } $update = $now unless defined $update; } else { my $changed = 0; if (get_new_always($url)) { $changed = get_new($ua, $filename, $url, @feeds); } else { $changed = update_feed($ua, $filename, @feeds); } # reduce times of parsing feed my $prev_update = get_last_update_date($s->{htmlUrl}); if ($changed != 0 || !defined $prev_update) { $filename = correct_feed($filename); my $r = get_latest_update_date($filename, $prev_update); if (defined $r) { $update = $r->{date}; @new_feed_entries = @{$r->{entries}}; } elsif (defined $remove_old_entries) { pop_previous_entries($filename); } } else { $update = $prev_update; $filename = get_correct_feed_filename($filename); if (defined $remove_old_entries) { @new_feed_entries = pop_previous_entries($filename); } else { @new_feed_entries = get_previous_entries($filename); } } } my $num_new_feed_entries = scalar @new_feed_entries; $total_new_entries += $num_new_feed_entries; my $feed_info = fill_template ($s, $update, $num_new_feed_entries, \@new_feed_entries); push @items, $feed_info; } return \@items; } sub sort_items_by_date { my $items = shift; my @na = grep { ! UNIVERSAL::isa($_->{FEED_UPDATE_DATE}, 'DateTime') } @$items; my @others = grep { UNIVERSAL::isa($_->{FEED_UPDATE_DATE}, 'DateTime') } @$items; my @sorted_items = sort { DateTime->compare($b->{FEED_UPDATE_DATE}, $a->{FEED_UPDATE_DATE}) } @others; push @sorted_items, @na; return \@sorted_items; } sub generate_html_from_template { my ($number, $items) = @_; my $t = HTML::Template->new(filename => $templatefile); $t->param(FEEDNUM => $number); $t->param(NUM_NEW_ENTRIES => $total_new_entries); $t->param(DATE => $now); $t->param(SUBSCRIPTION => $items); open HTML, ">$output"; print HTML encode_utf8($t->output); close HTML; } sub save_updated_info { for my $url (keys %last_update_date) { format_last_update_date($url); } DumpFile($datefile, \%last_update_date); DumpFile($entryfile, \%last_update_entries); DumpFile($getnewfile, \%always_get_new); } sub format_datetime_w3cdtf { my $entries = shift; my @result; for my $e (@$entries) { my %entry_info; for my $key (keys %$e) { if (blessed $e->{$key} && $e->{$key}->isa('DateTime')) { $entry_info{$key} = DateTime::Format::W3CDTF->format_datetime($e->{$key}); } else { $entry_info{$key} = $e->{$key}; } } push @result, \%entry_info; } return @result; } sub parse_datetime_from_w3cdtf { my $entries = shift; my @result; for my $e (@$entries) { my %entry_info; for my $key (keys %$e) { if ($key eq "issued" or $key eq "modified") { $entry_info{$key} = DateTime::Format::W3CDTF->parse_datetime($e->{$key}); } else { $entry_info{$key} = $e->{$key}; } } push @result, \%entry_info; } return @result; } sub get_previous_entries { my $filename = shift; return unless exists $last_update_entries{$filename}; return @{$last_update_entries{$filename}}; } sub pop_previous_entries { my $filename = shift; my @previous_entries = get_previous_entries($filename); if (exists $last_update_entries{$filename}) { delete $last_update_entries{$filename}; } return @previous_entries; } sub set_last_update_entries { my ($filename, @entries) = @_; $last_update_entries{$filename} = \@entries; } sub filter_out_entries_after { my ($entries, $to) = @_; return grep { DateTime->compare($_->{date}, $to) <= 0 } @$entries; } # get entries between date range ($from, $to] since date of entries # without date from feed will be assigned to $to. sub get_entries_between { my ($entries, $from, $to) = @_; my @new = grep { DateTime->compare($_->{date}, $from) == 1 } @$entries; return unless @new; return filter_out_entries_after(\@new, $to); } sub get_latest_update_date { my ($filename, $prev_update) = @_; my @entries; return unless -e $filename and -f _; print STDERR "parse $filename..."; my $feed; eval { $feed = XML::Feed->parse($filename); }; if ($@) { print STDERR "bad feed\n"; return; } print STDERR "done\n"; return unless defined $feed; for my $entry ($feed->entries) { my %info; $info{title} = $entry->title; $info{link} = (defined $entry->link) ? $entry->link : $feed->link; my $m = $entry->modified; if (defined $m) { $info{modified} = $m; $info{modified}->set_time_zone('Asia/Shanghai'); } $m = $entry->issued; my $last_entry_update = get_last_update_date($entry->link); $info{issued} = defined $m ? $m : defined $last_entry_update ? $last_entry_update : $now ; $info{issued}->set_time_zone('Asia/Shanghai'); push @entries, \%info; } return unless scalar @entries > 0; my $i; my @sort_candidates; for ($i = 0; $i <= $#entries; ++$i) { my $d; if (exists $entries[$i]->{modified}) { $d = $entries[$i]->{modified}; } elsif (exists $entries[$i]->{issued}) { $d = $entries[$i]->{issued}; } push @sort_candidates, { date => $d, index => $i }; } my @sorted_entries = sort { DateTime->compare($b->{date}, $a->{date}) } @sort_candidates; my @new_entries = (! defined $prev_update) ? filter_out_entries_after(\@sorted_entries, $now) : get_entries_between(\@sorted_entries, $prev_update, $now); my @prev_entries; if (defined $remove_old_entries) { @prev_entries = pop_previous_entries($filename); } else { @prev_entries = get_previous_entries($filename); } my @parsed_prev_entries = parse_datetime_from_w3cdtf(\@prev_entries); # feed is modified but no new entry. return { date => $prev_update, entries => \@parsed_prev_entries } unless @new_entries; my $update = $new_entries[0]->{date}; my @new_feed_entries = map { $entries[$_->{index}] } @new_entries; my @updated_entries = format_datetime_w3cdtf(\@new_feed_entries); push @new_feed_entries, @parsed_prev_entries; if (! defined $remove_old_entries) { push @updated_entries, @prev_entries; } set_last_update_entries($filename, @updated_entries); if (DateTime->compare($update, $now) == 0) { for (@new_entries) { my $index = $_->{index}; my $date = $_->{date}; my $link = $entries[$index]->{link}; set_last_update_date($link, $date); } } return { date => $update, entries => \@new_feed_entries }; } sub compute_hexdigest { my $filename = shift; my $sha = Digest::SHA->new(256); $sha->addfile($filename); return $sha->hexdigest; } sub get_new { my ($ua, $filename, $url, @feeds) = @_; for my $f (@feeds) { print STDERR "fetch $f ..."; my $response = $ua->get($f, ':content_file' => $filename); if ($response->is_success) { my $hexdigest = compute_hexdigest($filename); my $prevdigest = get_digest_string($url, $f); if ($prevdigest eq $hexdigest) { print STDERR "up to date\n"; return 0; } else { print STDERR "done\n"; set_digest($url, $f, $hexdigest); return 1; } } print STDERR "skipped\n"; } return 0; } sub update_feed { my ($ua, $filename, @feeds) = @_; for my $f (@feeds) { print STDERR "mirror $f ..."; my $response; eval { $response = $ua->mirror($f, $filename); }; if ($@) { print STDERR "failed\n"; next; } if ($response->code == RC_NOT_MODIFIED) { print STDERR "up to date\n"; return 0; } if ($response->is_success) { print STDERR "done\n"; return 1; } print STDERR "skipped\n"; } return 0; } sub fill_template { my ($s, $update, $num_new_feed_entries, $new_feed_entries) = @_; my %feed = ( LINK => $s->{htmlUrl}, TITLE => $s->{title}, DEFAULT_FEED => $s->{xmlUrl}, MORE => exists $s->{feeds}, HAS_NEW_ENTRIES => $num_new_feed_entries, ); if (exists $s->{feeds}) { my @others = (ref $s->{feeds}) ? @{$s->{feeds}} : ($s->{feeds}); my @backup_feeds; push @backup_feeds, { feed => $_ } for @others; $feed{FEEDS} = \@backup_feeds; } if (defined $update) { $feed{FEED_UPDATE_DATE} = $update; set_last_update_date($feed{LINK}, $update); } else { $feed{FEED_UPDATE_DATE} = "N/A"; remove_last_update_date($feed{LINK}); } if ($num_new_feed_entries > 0) { $feed{NEW_ENTRIES} = $new_feed_entries; } return \%feed; } sub make_directory { my $xmlUrl = shift; my $uri = URI->new($xmlUrl); my $domain = $uri->host; my @segments = $uri->path_segments; pop @segments if $segments[-1] eq ""; push @segments, $uri->query if defined $uri->query; my $path = catdir($feedman_home, $domain, @segments); unless (-e $path and -d _) { print STDERR "create path: $path\n"; mkpath($path, 0, 0700); } return $path; } 1;