Ep Guides Reminder
After regularly missing my favourite American TV shows, I decided to write a little script to remind me to watch them.
It scrapes data from epguides.com and builds an RSS feed of the shows that are going to be released in the next week (this can be changed in the parameters). Because the scraping is pretty slow, and because I didn't want to use too much bandwidth, I set up a cron job and piped the output of the script to a file that I've added to my iGoogle homepage.
Now whenever I open my homepage I can instantly see what shows that I like are going to be aired in the coming weeks!
Screen scraping is never ideal, but it usually works. Rather conveniently, the format of epguides pages is very table-like. To avoid writing many regular expressions that might catch the wrong data when information is missing, I split the data into its columns and rows: I match the correct div element and then process each line by splitting it into substrings. This isn't the most efficient way, but since it only runs daily in a cron job, it's fine for me.
-
<?php
-
// Base URL; each show's path segment from $toFetch is appended to it.
$urlPrefix = "http://epguides.com/";

// The shows for the feed: display name => path segment on epguides.com.
// After the loop below, each value is replaced by the show's full URL.
$toFetch = array("Prison Break" => "PrisonBreak", "Dexter" => "Dexter", "Lost" => "Lost", "NUMB3RS" => "NUMB3RS", "House, M.D." => "House", "Family Guy" => "FamilyGuy", "American Dad" => "AmericanDad", "South Park" => "SouthPark");

// date() format used for the title of each feed item.
$dateFormat = "l jS F Y";

// Only episodes airing between $minAge and $maxAge (inclusive) are listed.
$minAge = time() - 172800; // 2 days ago
$maxAge = time() + 604800; // one week from today

require("downloader.class.php");

header('Content-type: application/xml; charset="utf-8"', true);

// Configured show name => list of that show's episodes inside the window.
$list = array();

// Iterate by key instead of by reference: a by-reference loop variable stays
// bound to the last array element after the loop ends.
foreach ($toFetch as $showName => $path) {
    $url = $urlPrefix . $path . '/';

    // Keep the full URL so the feed can link to the show page later.
    $toFetch[$showName] = $url;

    // Restart the execution timer for every show.
    set_time_limit(20);

    $file = getFile($url);
    $arr = performMatch($file);

    // Key by the configured name rather than the scraped page title
    // ($arr[0]): the feed looks the show URL up in $toFetch by this key, and
    // the scraped title need not match it (or may be missing entirely when
    // the page could not be parsed).
    $list[$showName] = filter_old($arr[1]);
}
-
-
// Build the empty RSS skeleton that the items are appended to.
$dom = new DOMDocument();

// loadXML() reports failure through its return value; the DOMDocument object
// itself is always truthy, so testing $dom would never detect an error.
$loaded = $dom->loadXML('<rss version="0.92">
<channel>
<title>Upcoming TV Shows</title>
<description>The latest TV shows from epguides.com</description>
<link>http://epguides.com</link>
</channel>
</rss>');

if (!$loaded) {
    echo 'Error while parsing the document';
    exit;
}

$mainRoot = simplexml_import_dom($dom);
$root = $mainRoot->channel[0];

// Regroup the episodes: air-date timestamp => list of episodes on that date.
$byDate = array();
foreach ($list as $id => $show) {
    foreach ($show as $ep) {
        $ep['show'] = $id;
        $byDate[$ep['air-date']][] = $ep;
    }
}

// Emit the days in chronological order instead of show-configuration order.
ksort($byDate);

// One feed item per air date, listing every episode airing on that date.
foreach ($byDate as $id => $date) {
    $showElement = $root->addChild('item');
    $showElement->addChild('title', date($dateFormat, $id));
    $showElement->addChild('pubDate', date(DATE_RFC822));

    $desStr = "";
    foreach ($date as $ep) {
        // "season-episode" becomes a zero-padded " SxxEyy" label; anything
        // else is printed as scraped.
        if (preg_match("/([0-9]*)\-([0-9]*)/", $ep['ep-season'], $epSeasonSplit)) {
            $season = sprintf("%02s", $epSeasonSplit[1]);
            $episode = sprintf("%02s", $epSeasonSplit[2]);
            $desStr .= ' S' . $season . 'E' . $episode . " - ";
        } else {
            $desStr .= $ep['ep-season'] . " - ";
        }

        // Fall back to an empty href when the show name is not a $toFetch key.
        $showUrl = isset($toFetch[$ep['show']]) ? $toFetch[$ep['show']] : '';

        $desStr .= '<a href="' . $showUrl . '">' . $ep['show'] . '</a> - <a href="' . $ep['link'] . '">' . $ep['title'] . "</a><br />";
    }

    // Assign through the property instead of addChild(): addChild() does not
    // escape "&", so an ampersand in a URL or title would raise a warning and
    // cut the description short. Property assignment escapes the whole string.
    $showElement->description = $desStr;
}

echo $mainRoot->asXML();
-
-
/**
 * Fetch the document at $url through the project's downloader class.
 *
 * A new downloader is created for every call; clearCache($url) is invoked
 * on it before get($url).
 * NOTE(review): presumably clearCache() forces a fresh download - confirm
 * against downloader.class.php.
 *
 * @param string $url Address of the page to fetch.
 * @return mixed Whatever downloader::get() returns for that URL.
 */
function getFile($url)
{
    $fetcher = new downloader();
    $fetcher->clearCache($url);
    $contents = $fetcher->get($url);

    return $contents;
}
-
-
/**
 * Keep only the episode rows whose air date lies inside a time window.
 *
 * Both bounds are inclusive. When a bound is omitted, the corresponding
 * global ($minAge / $maxAge) is used, so existing one-argument calls behave
 * as before.
 *
 * @param array    $arr  Episode rows; each row's 'air-date' entry is compared
 *                       numerically against the bounds.
 * @param int|null $from Earliest accepted air date; null means global $minAge.
 * @param int|null $to   Latest accepted air date; null means global $maxAge.
 * @return array The matching rows in their original order, re-indexed from 0.
 */
function filter_old($arr, $from = null, $to = null)
{
    global $maxAge, $minAge;

    if ($from === null) {
        $from = $minAge;
    }
    if ($to === null) {
        $to = $maxAge;
    }

    $ret = array();
    foreach ($arr as $row) {
        // A row without an air date can never fall inside the window; skip it
        // without triggering an undefined-index notice.
        if (!isset($row['air-date'])) {
            continue;
        }

        if (($row['air-date'] >= $from) && ($row['air-date'] <= $to)) {
            $ret[] = $row;
        }
    }

    return $ret;
}
-
-
/**
 * Scrape one show page into its title and a list of episodes.
 *
 * The episode list is the text inside <div id="eplist"><pre>...</pre>. It is
 * treated as a fixed-width table: the first line made up only of underscore
 * runs gives the column widths, and every line is then cut at those widths
 * into ep-num, ep-season, prod-num, air-date and title.
 *
 * @param string $file Raw HTML of the show page. A non-string value such as
 *                     false is treated as an empty page.
 * @return array array($showTitle, $episodes). $showTitle is the text of the
 *               <h1> link, or '' when it is not found. $episodes is a list of
 *               arrays with the keys 'ep-num', 'ep-season', 'prod-num',
 *               'air-date' (Unix timestamp from strtotime()), 'title' and
 *               'link'. Lines with an empty episode number or an unparseable
 *               air date are left out. The list is empty when the table or
 *               its underscore separator line cannot be found.
 */
function performMatch($file)
{
    $epTableRegex = '/<div id="eplist">.*<pre>(.*)<\/pre>.*<\/div>/isU';
    $hLinkRegex = '/<a target="[^"]*" href="([^"]*)">([^<]*)<\/a>/';
    $titleRegex = '/<h1><a href="([^"]*)">([^<]*)<\/a><\/h1>/';

    // Normalise so that a non-string input is handled like an empty page.
    $file = (string) $file;

    $showTitle = '';
    if (preg_match($titleRegex, $file, $titleMatches)) {
        $showTitle = $titleMatches[2];
    }

    $episodes = array();

    if (!preg_match($epTableRegex, $file, $tableMatches)) {
        return array($showTitle, $episodes);
    }

    // explode() replaces split(), which no longer exists as of PHP 7; the
    // delimiters used here are literal, so the result is the same.
    $ep_arr = explode("\n", trim($tableMatches[1]));

    // Find the separator: a line that, ignoring spaces and a trailing
    // carriage return, consists of underscores only. Handling "\r" explicitly
    // makes this work for both LF and CRLF line endings.
    $split_line = null;
    foreach ($ep_arr as $ep) {
        $stripped = str_replace(array(" ", "\r"), "", $ep);
        if ($stripped !== '' && strspn($stripped, '_') === strlen($stripped)) {
            $split_line = rtrim($ep, "\r");
            break;
        }
    }

    if ($split_line === null) {
        return array($showTitle, $episodes);
    }

    // Width of every underscore run, preceded by a 0 for the first offset.
    $col_lengths = array(0);
    foreach (explode(" ", $split_line) as $piece) {
        $len = strlen($piece);
        if ($len > 0) {
            $col_lengths[] = $len;
        }
    }

    // Turn the widths into cumulative start offsets; the +1 accounts for one
    // separating space after each column.
    // NOTE(review): this assumes the separator starts at column 0 and uses
    // single spaces between runs - confirm against a live page.
    $colCount = count($col_lengths);
    for ($i = 1; $i < $colCount; $i++) {
        $col_lengths[$i] = $col_lengths[$i] + $col_lengths[$i - 1] + 1;
    }

    $part_arr = array("ep-num", "ep-season", "prod-num", "air-date", "title");

    foreach ($ep_arr as $line) {
        // Start from empty cells so short separator lines cannot leave a
        // column undefined.
        $row = array_fill_keys($part_arr, '');

        for ($i = 1; $i < $colCount; $i++) {
            // Ignore any columns beyond the five that have a name.
            if (!isset($part_arr[$i - 1])) {
                break;
            }

            $start = $col_lengths[$i - 1];
            if ($i == $colCount - 1) {
                // The last column runs to the end of the line.
                $length = strlen($line) - $start;
            } else {
                $length = $col_lengths[$i] - $start;
            }

            // The cast covers PHP versions where substr() returns false for
            // an out-of-range start.
            $row[$part_arr[$i - 1]] = trim((string) substr($line, $start, $length));
        }

        $airdate = strtotime($row["air-date"]);
        $epNum = str_replace(".", "", $row["ep-num"]);

        // Headings, the separator itself and season captions have no episode
        // number or no parseable date; skip them.
        if ($epNum === '' || $airdate === false) {
            continue;
        }

        if (preg_match($hLinkRegex, $row["title"], $linkMatches)) {
            $title = $linkMatches[2];
            $link = $linkMatches[1];
        } else {
            // Title cell without a link: keep its text, leave the link empty.
            $title = $row["title"];
            $link = '';
        }

        $episodes[] = array(
            "ep-num" => $epNum,
            "ep-season" => str_replace(" ", "", $row["ep-season"]),
            "prod-num" => $row["prod-num"],
            "air-date" => $airdate,
            "title" => $title,
            "link" => $link,
        );
    }

    return array($showTitle, $episodes);
}
-
?>
The downloader class simply grabs and manages files for local scraping.