jikamens · December 11, 2015 16:08
diff --git a/hn-comment-feed.cgi b/hn-comment-feed.cgi
 #!/usr/bin/perl

 # CGI script for generating an RSS feed of the comment on a Hacker
 # News posting.
 #
 # Copyright (c) 2013 Jonathan Kamens <[email protected]>.
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # This program is distributed in the hope that it will be useful, but
 # WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 # General Public License for more details.
 #
 # See <http://www.gnu.org/licenses/>.
 #
 # Published at <https://gist.github.com/4625596>.


 use strict;
 use warnings;

 use CGI ':standard';
 use CGI::Carp 'fatalsToBrowser';
 use Data::Dumper;
 # Alas, the server I'm using this on has only perl 5.8. If you have
 # the newer Date::Manip::Date, you can update the script to use it by
 # searching for "DMD" and making the appropriate edits.
 # DMD
 # use Date::Manip::Date;
 use Date::Manip;
 use HTML::TreeBuilder;
 use LWP::UserAgent;
 use XML::RSS;

 my $base = 'https://news.ycombinator.com';
 my $title_limit = 80;
 # Set this to undef if you don't want to cache dates. Caching dates
 # is necessary because the dates we get back from HN are vague, e.g.,
 # "4 hours ago".
 my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache';
 my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month

 my $now = time();

 my %date_cache;
 &read_date_cache();

 my $id = param('id');
 die "No post id specified\n" if (! $id);
 die "Bad id '$id'\n" if ($id !~ /^\d+$/);

 my $ua = LWP::UserAgent->new();

 my $url = "$base/item?id=$id";
 my $response = $ua->get($url);
 die "Failed to fetch $url\n" if (! $response->is_success);

 my $tb = HTML::TreeBuilder->new();
 $tb->parse_content($response->decoded_content());

 my @comments;

 my $title = $tb->find_by_tag_name('title')->as_text();
 die "No title\n" if (! $title);
 $title = "Comment feed for: $title";

 my(@elements) = $tb->descendants();
 foreach my $element (@elements) {
    my $class = $element->attr('class');
    if ($class and $class eq 'comhead') {
 	while (@comments and ! $comments[-1]{'comment'}) {
 	    pop @comments;
 	}
 	push(@comments, {'comhead' => $element});
    }
    if ($class and $class eq 'comment') {
 	next if (! @comments);
 	$comments[-1]{'comment'} = $element;
    }
 }

 # If there are no comments yet, there will be one comhead with no
 # associated comment (because the HN formatting asininely uses the
 # "comhead" class for both the site name in the post header and for
 # comment headers. Hey, HN maintainers, learn how to use CSS properly!
 if (@comments == 1 and ! $comments[0]->{'comment'}) {
    @comments = ();
 }

 foreach my $comment (@comments) {
    @elements = $comment->{'comhead'}->content_list();
    # Structure of comhead is link to user, post time, link to comment
    die "Bad comhead\n" if (@elements != 3);

    my($author_tag) = $elements[0];
    my($age) = $elements[1];
    my($comment_link) = $elements[2];

    my($author_link) = $author_tag->attr('href');
    die "Bad author link\n" if (! $author_link);
    $author_link = "$base/$author_link";

    @elements = $author_tag->content_list();
    die "Bad author tag\n" if (@elements != 1);
    my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0];
    my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0];

    $comment_link = $comment_link->attr('href');
    die "Bad comment link\n" if (! $comment_link);
    $comment_link = "$base/$comment_link";

    die "Bad comment age\n" if (ref $age);
    die "Bad comment age\n" if ($age !~ s/\s*\|\s*$//);
    # DMD
    # my $date = Date::Manip::Date->new($age);
    # die "Bad comment age\n" if ($date->err);
    # $date->convert('UTC');
    # $date = $date->printf('%O+00:00');
    my $date = ParseDate($age);
    die "Bad comment age\n" if (! $date);
    $date = Date_ConvTZ($date, '', 'UTC');
    $date = UnixDate($date, '%O+00:00');

    $date = &get_date_cache($comment_link, $date);

    @elements = $comment->{'comment'}->content_list();
    my $content = '';
    for (@elements) {
 	if (ref) {
 	    $content .= $_->as_HTML();
 	}
 	else {
 	    $content .= $_;
 	}
    }

    my($title) = $comment->{'comment'}->as_text();
    if (length($title) > $title_limit) {
 	$title = substr($title, 0, $title_limit-3) . '...';
    }

    $comment->{'author_link'} = $author_link;
    $comment->{'author_html'} = $author_html;
    $comment->{'author_name'} = $author_name;
    $comment->{'age'} = $age;
    $comment->{'date'} = $date;
    $comment->{'comment_link'} = $comment_link;
    $comment->{'content'} = $content;
    $comment->{'title'} = $title;
 }

 &write_date_cache();

 @comments = sort { $b->{'date'} cmp $a->{'date'} } @comments;

 my $rss = XML::RSS->new();

 $rss->channel(title => $title, link => $url);

 for (@comments) {
    my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>";

    $rss->add_item
 	(
 	 title       => $_->{'title'},
 	 link        => $_->{'comment_link'},
 	 description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>",
 	 dc => {
 	     date    => $_->{'date'},
 	     creator => $_->{'author_name'},
 	 });
 }

 print header('application/rss+xml');
 print $rss->as_string; 

 $tb->delete();

 sub read_date_cache {
    return if (! $date_cache_file);
    return if (! -f $date_cache_file);
    open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n";
    while (<CACHE>) {
 	chomp;
 	my($url, $date, $last_used) = split;
 	$date_cache{$url} = {
 	    date => $date,
 	    last_used => $last_used
 	    };
    }
    close(CACHE) or die "Error reading date cache\n";
 }

 sub get_date_cache {
    my($url, $date) = @_;
    $date_cache{$url}{'last_used'} = $now;
    if ($date_cache{$url}{'date'}) {
 	return $date_cache{$url}{'date'};
    }
    else {
 	return $date_cache{$url}{'date'} = $date;
    }
 }

 sub write_date_cache {
    return if (! $date_cache_file);
    my $new = "$date_cache_file.new";
    open(CACHE, '>', $new) or die "Error writing date cache\n";
    foreach  my $url (keys %date_cache) {
 	next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout);
 	print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n")
 	    or die "Error writing date cache\n";
    }
    close(CACHE) or die "Error writing date cache\n";
    rename($new, $date_cache_file) or die "Error writing date cache\n";
 }

 # CHANGES
 #
 # 2013/02/01 - Don't barf on postings with no comments.
 # 2013/01/24 - Creator name in the XML should not contain HTML.
 # 2013/01/24 - Creator name in the XML should not be a link.
 # 2013/01/24 - Initial release.
	#!/usr/bin/perl

	# CGI script for generating an RSS feed of the comment on a Hacker
	# News posting.
	#
	# Copyright (c) 2013 Jonathan Kamens <[email protected]>.
	#
	# This program is free software: you can redistribute it and/or modify
	# it under the terms of the GNU General Public License as published by
	# the Free Software Foundation, either version 3 of the License, or
	# (at your option) any later version.
	#
	# This program is distributed in the hope that it will be useful, but
	# WITHOUT ANY WARRANTY; without even the implied warranty of
	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	# General Public License for more details.
	#
	# See <http://www.gnu.org/licenses/>.
	#
	# Published at <https://gist.github.com/4625596>.


	use strict;
	use warnings;

	use CGI ':standard';
	use CGI::Carp 'fatalsToBrowser';
	use Data::Dumper;
	# Alas, the server I'm using this on has only perl 5.8. If you have
	# the newer Date::Manip::Date, you can update the script to use it by
	# searching for "DMD" and making the appropriate edits.
	# DMD
	# use Date::Manip::Date;
	use Date::Manip;
	use HTML::TreeBuilder;
	use LWP::UserAgent;
	use XML::RSS;

	my $base = 'https://news.ycombinator.com';
	my $title_limit = 80;
	# Set this to undef if you don't want to cache dates. Caching dates
	# is necessary because the dates we get back from HN are vague, e.g.,
	# "4 hours ago".
	my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache';
	my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month

	my $now = time();

	my %date_cache;
	&read_date_cache();

	my $id = param('id');
	die "No post id specified\n" if (! $id);
	die "Bad id '$id'\n" if ($id !~ /^\d+$/);

	my $ua = LWP::UserAgent->new();

	my $url = "$base/item?id=$id";
	my $response = $ua->get($url);
	die "Failed to fetch $url\n" if (! $response->is_success);

	my $tb = HTML::TreeBuilder->new();
	$tb->parse_content($response->decoded_content());

	my @comments;

	my $title = $tb->find_by_tag_name('title')->as_text();
	die "No title\n" if (! $title);
	$title = "Comment feed for: $title";

	my(@elements) = $tb->descendants();
	foreach my $element (@elements) {
	my $class = $element->attr('class');
	if ($class and $class eq 'comhead') {
	while (@comments and ! $comments[-1]{'comment'}) {
	pop @comments;
	}
	push(@comments, {'comhead' => $element});
	}
	if ($class and $class eq 'comment') {
	next if (! @comments);
	$comments[-1]{'comment'} = $element;
	}
	}

	# If there are no comments yet, there will be one comhead with no
	# associated comment (because the HN formatting asininely uses the
	# "comhead" class for both the site name in the post header and for
	# comment headers. Hey, HN maintainers, learn how to use CSS properly!
	if (@comments == 1 and ! $comments[0]->{'comment'}) {
	@comments = ();
	}

	foreach my $comment (@comments) {
	@elements = $comment->{'comhead'}->content_list();
	# Structure of comhead is link to user, post time, link to comment
	die "Bad comhead\n" if (@elements != 3);

	my($author_tag) = $elements[0];
	my($age) = $elements[1];
	my($comment_link) = $elements[2];

	my($author_link) = $author_tag->attr('href');
	die "Bad author link\n" if (! $author_link);
	$author_link = "$base/$author_link";

	@elements = $author_tag->content_list();
	die "Bad author tag\n" if (@elements != 1);
	my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0];
	my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0];

	$comment_link = $comment_link->attr('href');
	die "Bad comment link\n" if (! $comment_link);
	$comment_link = "$base/$comment_link";

	die "Bad comment age\n" if (ref $age);
	die "Bad comment age\n" if ($age !~ s/\s\\|\s$//);
	# DMD
	# my $date = Date::Manip::Date->new($age);
	# die "Bad comment age\n" if ($date->err);
	# $date->convert('UTC');
	# $date = $date->printf('%O+00:00');
	my $date = ParseDate($age);
	die "Bad comment age\n" if (! $date);
	$date = Date_ConvTZ($date, '', 'UTC');
	$date = UnixDate($date, '%O+00:00');

	$date = &get_date_cache($comment_link, $date);

	@elements = $comment->{'comment'}->content_list();
	my $content = '';
	for (@elements) {
	if (ref) {
	$content .= $_->as_HTML();
	}
	else {
	$content .= $_;
	}
	}

	my($title) = $comment->{'comment'}->as_text();
	if (length($title) > $title_limit) {
	$title = substr($title, 0, $title_limit-3) . '...';
	}

	$comment->{'author_link'} = $author_link;
	$comment->{'author_html'} = $author_html;
	$comment->{'author_name'} = $author_name;
	$comment->{'age'} = $age;
	$comment->{'date'} = $date;
	$comment->{'comment_link'} = $comment_link;
	$comment->{'content'} = $content;
	$comment->{'title'} = $title;
	}

	&write_date_cache();

	@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments;

	my $rss = XML::RSS->new();

	$rss->channel(title => $title, link => $url);

	for (@comments) {
	my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>";

	$rss->add_item
	(
	title => $_->{'title'},
	link => $_->{'comment_link'},
	description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>",
	dc => {
	date => $_->{'date'},
	creator => $_->{'author_name'},
	});
	}

	print header('application/rss+xml');
	print $rss->as_string;

	$tb->delete();

	sub read_date_cache {
	return if (! $date_cache_file);
	return if (! -f $date_cache_file);
	open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n";
	while (<CACHE>) {
	chomp;
	my($url, $date, $last_used) = split;
	$date_cache{$url} = {
	date => $date,
	last_used => $last_used
	};
	}
	close(CACHE) or die "Error reading date cache\n";
	}

	sub get_date_cache {
	my($url, $date) = @_;
	$date_cache{$url}{'last_used'} = $now;
	if ($date_cache{$url}{'date'}) {
	return $date_cache{$url}{'date'};
	}
	else {
	return $date_cache{$url}{'date'} = $date;
	}
	}

	sub write_date_cache {
	return if (! $date_cache_file);
	my $new = "$date_cache_file.new";
	open(CACHE, '>', $new) or die "Error writing date cache\n";
	foreach my $url (keys %date_cache) {
	next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout);
	print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n")
	or die "Error writing date cache\n";
	}
	close(CACHE) or die "Error writing date cache\n";
	rename($new, $date_cache_file) or die "Error writing date cache\n";
	}

	# CHANGES
	#
	# 2013/02/01 - Don't barf on postings with no comments.
	# 2013/01/24 - Creator name in the XML should not contain HTML.
	# 2013/01/24 - Creator name in the XML should not be a link.
	# 2013/01/24 - Initial release.