Last active
December 11, 2015 16:08
-
-
Save jikamens/4625596 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/perl | |
# CGI script for generating an RSS feed of the comment on a Hacker | |
# News posting. | |
# | |
# Copyright (c) 2013 Jonathan Kamens <[email protected]>. | |
# | |
# This program is free software: you can redistribute it and/or modify | |
# it under the terms of the GNU General Public License as published by | |
# the Free Software Foundation, either version 3 of the License, or | |
# (at your option) any later version. | |
# | |
# This program is distributed in the hope that it will be useful, but | |
# WITHOUT ANY WARRANTY; without even the implied warranty of | |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
# General Public License for more details. | |
# | |
# See <http://www.gnu.org/licenses/>. | |
# | |
# Published at <https://gist.github.com/4625596>. | |
use strict; | |
use warnings; | |
use CGI ':standard'; | |
use CGI::Carp 'fatalsToBrowser'; | |
use Data::Dumper; | |
# Alas, the server I'm using this on has only perl 5.8. If you have | |
# the newer Date::Manip::Date, you can update the script to use it by | |
# searching for "DMD" and making the appropriate edits. | |
# DMD | |
# use Date::Manip::Date; | |
use Date::Manip; | |
use HTML::TreeBuilder; | |
use LWP::UserAgent; | |
use XML::RSS; | |
my $base = 'https://news.ycombinator.com'; | |
my $title_limit = 80; | |
# Set this to undef if you don't want to cache dates. Caching dates | |
# is necessary because the dates we get back from HN are vague, e.g., | |
# "4 hours ago". | |
my $date_cache_file = '/var/www/tmp/hn-comment-feed.cache'; | |
my $date_cache_timeout = 60 * 60 * 24 * 31; # 1 month | |
my $now = time(); | |
my %date_cache; | |
&read_date_cache(); | |
my $id = param('id'); | |
die "No post id specified\n" if (! $id); | |
die "Bad id '$id'\n" if ($id !~ /^\d+$/); | |
my $ua = LWP::UserAgent->new(); | |
my $url = "$base/item?id=$id"; | |
my $response = $ua->get($url); | |
die "Failed to fetch $url\n" if (! $response->is_success); | |
my $tb = HTML::TreeBuilder->new(); | |
$tb->parse_content($response->decoded_content()); | |
my @comments; | |
my $title = $tb->find_by_tag_name('title')->as_text(); | |
die "No title\n" if (! $title); | |
$title = "Comment feed for: $title"; | |
my(@elements) = $tb->descendants(); | |
foreach my $element (@elements) { | |
my $class = $element->attr('class'); | |
if ($class and $class eq 'comhead') { | |
while (@comments and ! $comments[-1]{'comment'}) { | |
pop @comments; | |
} | |
push(@comments, {'comhead' => $element}); | |
} | |
if ($class and $class eq 'comment') { | |
next if (! @comments); | |
$comments[-1]{'comment'} = $element; | |
} | |
} | |
# If there are no comments yet, there will be one comhead with no | |
# associated comment (because the HN formatting asininely uses the | |
# "comhead" class for both the site name in the post header and for | |
# comment headers. Hey, HN maintainers, learn how to use CSS properly! | |
if (@comments == 1 and ! $comments[0]->{'comment'}) { | |
@comments = (); | |
} | |
foreach my $comment (@comments) { | |
@elements = $comment->{'comhead'}->content_list(); | |
# Structure of comhead is link to user, post time, link to comment | |
die "Bad comhead\n" if (@elements != 3); | |
my($author_tag) = $elements[0]; | |
my($age) = $elements[1]; | |
my($comment_link) = $elements[2]; | |
my($author_link) = $author_tag->attr('href'); | |
die "Bad author link\n" if (! $author_link); | |
$author_link = "$base/$author_link"; | |
@elements = $author_tag->content_list(); | |
die "Bad author tag\n" if (@elements != 1); | |
my($author_html) = ref $elements[0] ? $elements[0]->as_HTML() : $elements[0]; | |
my($author_name) = ref $elements[0] ? $elements[0]->as_text() : $elements[0]; | |
$comment_link = $comment_link->attr('href'); | |
die "Bad comment link\n" if (! $comment_link); | |
$comment_link = "$base/$comment_link"; | |
die "Bad comment age\n" if (ref $age); | |
die "Bad comment age\n" if ($age !~ s/\s*\|\s*$//); | |
# DMD | |
# my $date = Date::Manip::Date->new($age); | |
# die "Bad comment age\n" if ($date->err); | |
# $date->convert('UTC'); | |
# $date = $date->printf('%O+00:00'); | |
my $date = ParseDate($age); | |
die "Bad comment age\n" if (! $date); | |
$date = Date_ConvTZ($date, '', 'UTC'); | |
$date = UnixDate($date, '%O+00:00'); | |
$date = &get_date_cache($comment_link, $date); | |
@elements = $comment->{'comment'}->content_list(); | |
my $content = ''; | |
for (@elements) { | |
if (ref) { | |
$content .= $_->as_HTML(); | |
} | |
else { | |
$content .= $_; | |
} | |
} | |
my($title) = $comment->{'comment'}->as_text(); | |
if (length($title) > $title_limit) { | |
$title = substr($title, 0, $title_limit-3) . '...'; | |
} | |
$comment->{'author_link'} = $author_link; | |
$comment->{'author_html'} = $author_html; | |
$comment->{'author_name'} = $author_name; | |
$comment->{'age'} = $age; | |
$comment->{'date'} = $date; | |
$comment->{'comment_link'} = $comment_link; | |
$comment->{'content'} = $content; | |
$comment->{'title'} = $title; | |
} | |
&write_date_cache(); | |
@comments = sort { $b->{'date'} cmp $a->{'date'} } @comments; | |
my $rss = XML::RSS->new(); | |
$rss->channel(title => $title, link => $url); | |
for (@comments) { | |
my $creator_blob = "<a href='$_->{author_link}'>$_->{author_html}</a>"; | |
$rss->add_item | |
( | |
title => $_->{'title'}, | |
link => $_->{'comment_link'}, | |
description => $_->{'content'} . "<p>(by $creator_blob, $_->{'age'})</p>", | |
dc => { | |
date => $_->{'date'}, | |
creator => $_->{'author_name'}, | |
}); | |
} | |
print header('application/rss+xml'); | |
print $rss->as_string; | |
$tb->delete(); | |
sub read_date_cache { | |
return if (! $date_cache_file); | |
return if (! -f $date_cache_file); | |
open(CACHE ,'<', $date_cache_file) or die "Error reading date cache\n"; | |
while (<CACHE>) { | |
chomp; | |
my($url, $date, $last_used) = split; | |
$date_cache{$url} = { | |
date => $date, | |
last_used => $last_used | |
}; | |
} | |
close(CACHE) or die "Error reading date cache\n"; | |
} | |
sub get_date_cache { | |
my($url, $date) = @_; | |
$date_cache{$url}{'last_used'} = $now; | |
if ($date_cache{$url}{'date'}) { | |
return $date_cache{$url}{'date'}; | |
} | |
else { | |
return $date_cache{$url}{'date'} = $date; | |
} | |
} | |
sub write_date_cache { | |
return if (! $date_cache_file); | |
my $new = "$date_cache_file.new"; | |
open(CACHE, '>', $new) or die "Error writing date cache\n"; | |
foreach my $url (keys %date_cache) { | |
next if ($now - $date_cache{$url}->{'last_used'} > $date_cache_timeout); | |
print(CACHE "$url $date_cache{$url}{'date'} $date_cache{$url}{'last_used'}\n") | |
or die "Error writing date cache\n"; | |
} | |
close(CACHE) or die "Error writing date cache\n"; | |
rename($new, $date_cache_file) or die "Error writing date cache\n"; | |
} | |
# CHANGES | |
# | |
# 2013/02/01 - Don't barf on postings with no comments. | |
# 2013/01/24 - Creator name in the XML should not contain HTML. | |
# 2013/01/24 - Creator name in the XML should not be a link. | |
# 2013/01/24 - Initial release. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment