Skip to content

Twitter のツイート保存 (save_user_timeline.pl)

by wetcradle on 9月 26th, 2012

以前紹介した save_user_timeline.pl ですが、Twitter の HTML を舐めていたので、Twitter の更新によって動かなくなっていました。
というわけで save_user_timeline.pl を現状の Twitter に対応させました。
ちなみに、cron に登録して、定期的にツイートを保存していくような用途を前提として考えています。

#!/usr/bin/perl

use strict;
use warnings;
use LWP::UserAgent;
use HTTP::Request;
use Date::Parse;
use HTML::TreeBuilder;
use Encode;
use JSON qw(decode_json);
use List::Util qw(max);

########## config ##########
# Base URL of the Twitter site; the profile page is fetched from this URL
# with the screen name appended.
my $TWITTER_URL     = 'http://twitter.com/';
# Directory that holds the saved timelines (one file per user id).
my $DIR             = '/path/to/save_dir';
# Seconds to sleep before a failed HTTP request is retried.
my $RETRY_DELAY     = 5;
# Total number of retries allowed during one run of the script.
my $RETRY_MAX_COUNT = 5;

########## main ##########
# Number of retries consumed so far in this run; shared by every HTTP request
# through inc_retry_count().
my $retry_count = 0;

# Entry point.  Takes exactly one command line argument, the screen name, and
#   1. resolves the screen name to a numeric user id,
#   2. loads the posts already saved for that user id,
#   3. fetches the posts that are newer than the saved ones,
#   4. writes the merged set back to "$DIR/<user id>".
# Progress is reported on STDERR.  Dies on bad usage, on an illegal screen
# name, or when the save directory cannot be created.
main: {
	if (scalar(@ARGV) != 1) {
		die "Usage: $0 screen_name";
	}
	
	my $screen_name = $ARGV[0];
	# Whitelist match: unlike a "contains an illegal character" test this also
	# rejects the empty string, which would otherwise request the site root.
	if ($screen_name !~ /\A[a-zA-Z0-9_]+\z/) {
		die "illegal screen_name";
	}
	
	print STDERR "$screen_name\n";
	
	# List-form system() bypasses the shell, so a directory name containing
	# spaces or shell metacharacters is passed to mkdir unchanged, and a
	# failure is reported here instead of surfacing later as "open failed".
	system("mkdir", "-p", $DIR) == 0
		or die "mkdir failed: $DIR";
	
	my $proxy = LWP::UserAgent->new;
	
	# The subs are called with "&" so that no prototype checking is applied
	# to subs that are declared further down in the file.
	my $user_id = &get_user_id($TWITTER_URL, $screen_name, $proxy);
	print STDERR "($user_id)\t";
	
	my $post_hash = &load_saved_timeline($DIR, $user_id);
	
	my $add_sum = &load_timeline($screen_name, $post_hash, $proxy);
	print STDERR "($add_sum)\n";
	
	&save_timeline($DIR, $user_id, $post_hash);
}

########## save_timeline ##########
# Writes every post of a timeline to the file "$dir/$user_id", replacing any
# previous content.
#
# Parameters:
#   $dir       - directory that holds the saved timelines
#   $user_id   - used as the file name inside $dir
#   $post_hash - hash ref mapping post id => post, where a post is a hash ref
#                with the keys "id", "time", "screen_name" and "data"
#
# Each post becomes one line "id,time,screen_name,data"; lines are ordered by
# ascending numeric post id.  An undefined field is written as an empty
# string.  Dies when the file cannot be opened or cannot be closed cleanly.
#
# No prototype is declared: the sub takes three arguments, which the former
# "($$)" prototype contradicted.
sub save_timeline {
	my ($dir, $user_id, $post_hash) = @_;
	my $path = "$dir/$user_id";
	open(my $posts_fh, '>', $path) || die "open failed: $path: $!";
	foreach my $id (sort {$a <=> $b} keys %{$post_hash}) {
		my $post = $post_hash->{$id};
		my @fields = map { defined($_) ? $_ : "" } @{$post}{"id", "time", "screen_name", "data"};
		print {$posts_fh} join(",", @fields), "\n";
	}
	# Buffered write errors (e.g. a full disk) only become visible at close.
	close($posts_fh) || die "close failed: $path: $!";
}

########## load_timeline ##########
# Fetches the posts of $screen_name that are newer than the posts already
# present in $post_hash and adds them to $post_hash.
#
# Parameters:
#   $screen_name - screen name whose timeline is fetched
#   $post_hash   - hash ref mapping post id => post; modified in place
#   $proxy       - user agent object used for the HTTP requests (request())
#
# Returns the number of posts that were added.
#
# The timeline is read page by page, newest post first.  Reading stops when a
# page reports no more items, when a page contains no tweets, when a post
# that is not newer than the newest saved post is reached, or when the retry
# budget of inc_retry_count() is used up.  Dies with "not found" on HTTP 404.
#
# No prototype is declared: the sub takes three arguments, which the former
# "($$)" prototype contradicted.
sub load_timeline {
	my ($screen_name, $post_hash, $proxy) = @_;
	# max() returns undef for an empty hash (first run); treat that as
	# "nothing saved yet" so the comparison below stays numeric.
	my $last_id = max(keys %{$post_hash});
	$last_id = 0 unless defined($last_id);
	my $add_sum = 0;
	my $max_id = undef;
	my $has_more_items = 1;
	TIMELINES: while ($has_more_items) {
		print STDERR ".";
		my $timeline_url = &get_timeline_url($screen_name, $max_id);
		my $req = HTTP::Request->new('GET' => $timeline_url);
		my $res = $proxy->request($req);
		if (!$res->is_success) {
			if ($res->code == 404) {
				die "not found";
			}
			# Retry the same page after a delay, or give up and keep what
			# has been collected so far.
			&inc_retry_count() || last;
			next;
		}
		my $result = $res->content();
		my $json = decode_json($result);
		$has_more_items = $json->{"has_more_items"};
		my $items_html = $json->{"items_html"};
		$items_html = "" unless defined($items_html);
		
		my $tree = HTML::TreeBuilder->new;
		$tree->parse($items_html);
		# parse() only feeds a chunk; eof() must be called before the tree
		# is inspected.
		$tree->eof;
		my @tweet_divs = $tree->look_down("data-tweet-id", qr/.+/);
		# An empty page ends the loop; so does reaching a saved post.  The
		# flag lets the tree be released on every exit path.
		my $done = scalar(@tweet_divs) ? 0 : 1;
		foreach my $tweet_div (@tweet_divs) {
			my $post = {};
			$post->{"screen_name"} = $screen_name;
			my $retweet_id = $tweet_div->attr("data-retweet-id");
			$post->{"id"} =  $retweet_id || $tweet_div->attr("data-tweet-id");
			if ($post->{"id"} <= $last_id) {
				# Everything from here on is already saved.
				$done = 1;
				last;
			}
			my ($time_span) = $tweet_div->look_down("data-time", qr/.+/);
			$post->{"time"} = $time_span->attr("data-time");
			my ($tweet_text_p) = $tweet_div->look_down("class", "js-tweet-text");
			my $data = encode('utf-8', $tweet_text_p->as_text);
			$data =~ s/(^ | $)//g;
			if ($retweet_id) {
				$data = 'RT @' . $tweet_div->attr("data-screen-name") . ": " . $data;
			}
			# Escape the characters that structure the save file (field and
			# record separators).  This has to happen before the text is
			# stored in the post, otherwise the unescaped text gets saved.
			$data =~ s/,/<comma>/g;
			$data =~ s/\r/<cr>/g;
			$data =~ s/\n/<lf>/g;
			$post->{"data"} = $data;
			$post_hash->{$post->{"id"}} = $post;
			# The next page is requested starting just below this post.
			$max_id = $post->{"id"} - 1;
			$add_sum++;
		}
		$tree = $tree->delete;
		last TIMELINES if $done;
	}
	return $add_sum;
}

########## load_saved_timeline ##########
# Reads the previously saved timeline from the file "$dir/$user_id".
#
# Parameters:
#   $dir     - directory that holds the saved timelines
#   $user_id - file name inside $dir
#
# Returns a hash ref mapping post id => post, where a post is a hash ref with
# the keys "id", "time", "screen_name" and "data".  Returns a reference to an
# empty hash when the file does not exist.  Dies when an existing file cannot
# be opened.
sub load_saved_timeline($$) {
	my ($dir, $user_id) = @_;
	my %post_hash = ();
	my $path = "$dir/$user_id";
	if ( -e $path ) {
		open(my $posts_fh, '<', $path) || die "open failed: $path: $!";
		while (my $line = <$posts_fh>) {
			chomp($line);
			my $post = {};
			# The limit of 4 keeps any comma inside the data field as part
			# of the data and yields "" (not undef) for an empty data field.
			@{$post}{"id", "time", "screen_name", "data"} = split(/,/, $line, 4);
			# A line without an id (e.g. a blank line) is not a post.
			next if !defined($post->{"id"}) || $post->{"id"} eq "";
			$post_hash{$post->{"id"}} = $post;
		}
		close($posts_fh);
	}
	return \%post_hash;
}

########## get_user_id ##########
# Looks up the numeric user id of a screen name by fetching the profile page
# and reading the "data-user-id" attribute of the element whose class is
# "profile-card-inner".
#
# Parameters:
#   $twitter_url - base URL; the screen name is appended to it
#   $screen_name - screen name to look up
#   $proxy       - user agent object used for the HTTP request (request())
#
# Returns the user id.
#
# Dies with "not found" on HTTP 404, "failed" when the retry budget of
# inc_retry_count() is used up, "protected user" when the page contains the
# protected-account marker, and "user id not found" when the page does not
# contain the expected profile element.
sub get_user_id($$$) {
	my ($twitter_url, $screen_name, $proxy) = @_;
	my $request_url = $twitter_url.$screen_name;
	my $res;
	# Explicit retry loop.  A bare "next" here would not retry anything: it
	# would leave this sub and the caller's enclosing block, ending the
	# program without an error.
	while (1) {
		my $req = HTTP::Request->new('GET' => $request_url);
		$res = $proxy->request($req);
		last if $res->is_success;
		if ($res->code == 404) {
			die "not found";
		}
		&inc_retry_count() || die "failed";
	}
	my $result = $res->content();
	#print $result;
	if (index($result, "class=\"protected-box\"") != -1) {
		die "protected user";
	}
	my $tree = HTML::TreeBuilder->new;
	$tree->parse($result);
	# parse() only feeds a chunk; eof() must be called before the tree is
	# inspected.
	$tree->eof;
	my ($profile_div) = $tree->look_down("class", "profile-card-inner");
	my $user_id = defined($profile_div) ? $profile_div->attr("data-user-id") : undef;
	$tree = $tree->delete;
	# Without a user id there is no file name to save under; fail with a
	# clear message instead of a method call on an undefined value.
	if (!defined($user_id) || $user_id eq "") {
		die "user id not found";
	}
	return $user_id;
}

########## get_timeline_url ##########
# Builds the URL of one page of a user's timeline.
#
# Parameters:
#   $screen_name - screen name whose timeline is requested
#   $max_id      - when defined, appended as the "max_id" query parameter;
#                  pass undef to request the first page
#
# Returns the URL as a string.
sub get_timeline_url($$) {
	my ($screen_name, $max_id) = @_;
	my $url = "http://twitter.com/i/profiles/show/$screen_name/timeline?include_available_features=1&include_entities=1";
	# defined() rather than truth, so that a max_id of 0 is still sent.
	$url .= "&max_id=$max_id" if defined($max_id);
	return $url;
}

########## inc_retry_count ##########
# Consumes one retry from the budget shared by the whole run.
#
# Increments the file-level $retry_count.  When the new value exceeds
# $RETRY_MAX_COUNT, warns "failed" and returns 0 without sleeping.  Otherwise
# sleeps $RETRY_DELAY seconds and returns 1, meaning the caller may try again.
sub inc_retry_count(){
	my $attempt = ++$retry_count;
	if ($RETRY_MAX_COUNT < $attempt) {
		warn "failed";
		return 0;
	}
	#warn "retry $retry_count";
	sleep($RETRY_DELAY);
	return 1;
}

From → perl, Twitter

No comments yet

Leave a Reply

Note: XHTML is allowed. Your email address will never be published.

Subscribe to this comment feed via RSS