| [ | mood |
| | creative | ] |
I've written a program (a hack, really), that will take a particular LJ username and return a list of that username's friend's interests, sorted by the most common interests. For example, 'publius_ovidius' returns something like:
dancing 21
movies 19
reading 18
sex 17
portland 16
music 14
travel 14
etc.
However, there are a couple of problems with the program. First, it's highly dependant on the structure of the LJ HTML, thus making it fragile. Are there RSS feeds or something similar of the full user info page?
There are a few other "issues" with this script, but I don't want to keep working on it until I know how to play nicely with LJ.
#!/usr/local/bin/perl
use warnings;
use strict;
use WWW::Mechanize;
use URI::Escape;
use HTML::Entities 'decode_entities';
use HTML::TokeParser::Simple;
use Getopt::Long;
$|++; # so we can see the results as they are printed
GetOptions(
'help|?' => \&usage,
'user=s' => \my $user,
'verbose=i' => \my $VERBOSE,
'syndicated' => \my $INCLUDE_SYNDICATED,
'communities' => \my $INCLUDE_COMMUNITIES,
);
sub usage
{
print <<" END_USAGE";
$0 will calculate the number of interests that the friends for
a given user has.
$0 --user [options]
--help Display this information and exit
--? Same as '--help'
--user Mandatory. This is the user we will fetch friends for.
--syndicated Include syndicated "friends" (e.g., doonesbury)
--communities Include communities
--verbose Takes an integer (0, 1, or 2). If zero, will print nothing
but the interest list (this is the default). 1 and 2 will
print more and more information. These are useful to let you
know the program has not "hung" if you're working with a
large list or over a slow connection.
Example:
$0 --user publius_ovidius --verbose 2 --communities
That will calculate the common interests for friends of publius_ovidius,
displaying verbose information and includes community interests (but does
not include syndicated feed interests).
Note that arguments may be abbreviated to the first letter. The above
command may be written as:
$0 -u publius_ovidius -v 2 -c
END_USAGE
exit;
}
# bad regexes. Need to improve them!
use constant FRIENDS => {
regex => qr{href='http://www\.livejournal\.com/users/[^/]+/friends'},
label => 'friends',
instance => qr{^/userinfo.bml\?user=(.*)$},
};
use constant INTERESTS => {
regex => qr{href='/interests\.bml'},
label => 'interests',
instance => qr{^/interests.bml\?int=(.*)$},
};
my $MECH = WWW::Mechanize->new;
$user ||= die "You must supply an LJ username";
print "Fetching user info for ($user) ...\n" if $VERBOSE;
my $html = get_user_info($user);
print "Fetching friends list for ($user)...\n" if $VERBOSE;
my $users = get_list($html,FRIENDS);
my $current = 1;
my $count = @$users;
my %sections;
foreach my $user (@$users) {
print "Fetching $user: $current out of $count\n" if $VERBOSE;
sleep 1; # be nice to their server
$current++;
print "Fetching user info for ($user) ...\n" if $VERBOSE > 1;
my $html = get_user_info($user);
next unless $html;
print "Fetching interests for ($user) ...\n" if $VERBOSE > 1;
my $interests = get_list($html, INTERESTS);
foreach my $interest (@$interests) {
$sections{$interest}++;
}
}
my @results =
sort { $b->[1] <=> $a->[1] }
map { [$_, $sections{$_}] }
keys %sections;
foreach my $interest (@results) {
printf "%30s %d\n", @$interest;
}
sub get_list
{
my ($html,$section) = @_;
my $parser = HTML::TokeParser::Simple->new(\$html);
while (my $token = $parser->get_token) {
next unless $token->as_is =~ /$section->{regex}/;
last;
}
$parser->get_tag('td'); # advance to first td tag
my @sections;
while (my $token = $parser->get_token) {
last if $token->is_end_tag('td'); # we're at the end of the member table data element
next unless $token->is_start_tag('a');
if ($token->return_attr->{href} =~ $section->{instance}) {
push @sections => decode_entities($1);
}
}
printf("\t%d %s found\n", scalar @sections, $section->{label})
if $VERBOSE > 1;
return \@sections;
}
sub get_user_info {
my $user = shift;
my $info = sprintf "http://www.livejournal.com/userinfo.bml?user=%s&mode=full"
=> uri_escape($user);
my $page = $MECH->get($info);
my $html = $MECH->content;
if ('Error' eq $MECH->title && $html =~ /Unknown user/) {
# this isn't perfect, but it's reasonable since LJ does
# not return error codes
warn "User ($user) not found";
return;
}
if ($MECH->title =~ /Syndicated Account/ && ! $INCLUDE_SYNDICATED) {
print "\tSkipping syndicated account ($user)\n" if $VERBOSE;
return;
}
if ($MECH->title =~ /Community Info/ && ! $INCLUDE_COMMUNITIES) {
print "\tSkipping community ($user)\n" if $VERBOSE;
return;
}
return $html;
} |