Skip to content

Instantly share code, notes, and snippets.

@joeromero
Forked from macias/CHANGELOG
Created September 2, 2013 22:05
Show Gist options
  • Save joeromero/6417680 to your computer and use it in GitHub Desktop.
Save joeromero/6417680 to your computer and use it in GitHub Desktop.
<?php
/* ===================== COMMON CODE ============================
This file is used by two other scripts.
2013-07-15 * bash printing functions are wrapped into class
2013-06-29 * technical change to make c_get work
2013-05-15, 2 * polished code for other scripts needs
2013-05-15 * suppressed warning on lost internet connection
2013-05-06 * generating "wget" command with all variables passed
from shell
2013-05-03 * more shared code put in here
2013-04-24 * added "chapter" phrase for removal
2013-04-22 * added function for dropping a week/lecture phrase
* removing multiple whitespaces in a row
2013-04-15 * added Coursera CSS getters and more shared code
2013-02-15 * special function for whitespace removal
2013-02-11 * converting character 0xc2 (?) and hard space to space
2012-10-19 * question marks in filenames are converted to spaces
(because of MS-Windows)
2012-09-18 * initial release
================================================================== */
// ---- Coursera specific stuff ----------------------------------------
// Command-line switches understood by the getter scripts. Each value is the
// literal argument text; the same text is used as the key when the parsed
// options are looked up in the $extras array.
$split_dirs_key = "--split_dirs";
$drop_deco_key  = "--drop_week";
$reverse_key    = "--reverse";
$beep_key       = "--beep";
$debug_key      = "--debug";
$limit_key      = "--limit";

// Raw content of the page most recently fetched by get_page_xpath().
$debug_page = null;
/**
 * Finds the section headers of a lectures page.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the parsed page
 * @return DOMNodeList every <div> whose class attribute contains
 *                     "course-item-list-header"
 */
function c_query_groups($xpath)
{
    $header_expr = '//div[contains(@class,"course-item-list-header")]';
    $headers = $xpath->query($header_expr);
    return $headers;
}
/**
 * Removes a leading "Week N", "Lecture N" or "Chapter N" phrase (plus any
 * following dots, colons, dashes and whitespace) from a name.
 *
 * @param string $name section or lecture name
 * @return string the shortened name, or $name unchanged when stripping the
 *                phrase would leave nothing
 */
function drop_deco($name)
{
    $stripped = preg_replace('/^(Week|Lecture|Chapter)[\s]*\d+[.:\s\-]*/', '', $name);
    // never hand back an empty name -- keep the decorated one instead
    return ($stripped == '') ? $name : $stripped;
}
/**
 * Reads the caption of a section header.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the parsed page
 * @param DOMNode  $group section header node (see c_query_groups)
 * @return string text of the first <h3> child of $group, passed through
 *                coursera_trim()
 */
function c_query_dir($xpath,$group)
{
    $headings = $xpath->query('./h3', $group);
    $heading = $headings->item(0);
    return coursera_trim($heading->nodeValue);
}
/**
 * Builds the numbered directory name for a section.
 *
 * @param string $dir         section caption
 * @param int    $group_count ordinal of the section, left-padded to two digits
 * @param bool   $drop_deco   when true the caption goes through drop_deco() first
 * @return string "<NN>. <caption made filename-safe by fix_filename()>"
 */
function c_deco_dir($dir,$group_count,$drop_deco)
{
    $label = $drop_deco ? drop_deco($dir) : $dir;
    $number = str_pad($group_count, 2, '0', STR_PAD_LEFT);
    return $number.'. '.fix_filename($label);
}
/**
 * Returns the lecture entries (<li> elements) belonging to a section.
 *
 * The entries are looked up inside the first *element* that follows the
 * section header. Whitespace or comment nodes between the header and its
 * list are skipped; querying inside such a node would find nothing.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the parsed page
 * @param DOMNode  $group section header node (see c_query_groups)
 * @return DOMNodeList the <li> descendants of the following element; empty
 *                     when the header has no following element
 */
function c_query_list($xpath,$group)
{
    $list = $group->nextSibling;
    while ($list !== NULL && $list->nodeType !== XML_ELEMENT_NODE)
        $list = $list->nextSibling;

    // A NULL context node would make the query run against the whole
    // document, so produce a deliberately empty result instead.
    if ($list === NULL)
        return $xpath->query('.//li[false()]', $group);

    return $xpath->query('.//li', $list);
}
/**
 * Extracts the lecture link and its filename-safe title from a list entry.
 *
 * @param DOMXPath $xpath     XPath evaluator bound to the parsed page
 * @param DOMNode  $node      list entry (see c_query_list)
 * @param bool     $drop_deco when true the title goes through drop_deco()
 * @param DOMNode  $row       [out] first <a> under $node whose class contains
 *                            "lecture-link"
 * @param string   $title     [out] text of the link's first child, cleaned
 *                            with fix_filename()
 */
function c_query_row($xpath,$node,$drop_deco,&$row,&$title)
{
    $anchors = $xpath->query('.//a[contains(@class,"lecture-link")]', $node);
    $row = $anchors->item(0);
    $clean = fix_filename($row->firstChild->nodeValue);
    $title = $drop_deco ? drop_deco($clean) : $clean;
}
/**
 * Fetches the viewer page of a lecture and returns its video sources.
 *
 * Lectures in preview mode are put at external pages, so the page named by
 * the link's "data-modal-iframe" attribute has to be downloaded separately.
 *
 * @param DOMNode     $row     lecture link (see c_query_row)
 * @param string      $ext     video type; matched as type="video/<ext>"
 * @param string|null $session session cookie value passed to get_page_xpath()
 * @return DOMNodeList|null matching <source> elements, or NULL when the
 *                          viewer page could not be loaded
 */
function c_get_embedded_links($row,$ext,$session = NULL)
{
    $frame_url = trim($row->attributes->getNamedItem('data-modal-iframe')->nodeValue);
    $page = get_page_xpath($frame_url, $session);
    if (!$page)
        return NULL;

    $type_test = 'source[@type="video/'.$ext.'"]';
    $sources = $page->query('//video[@id="QL_video_element_first"]/'.$type_test);
    if ($sources->length !== 0)
        return $sources;

    // second attempt: any matching source inside the player container
    return $page->query('//div[@id="QL_player_container_first"]//'.$type_test);
}
/*function c_get_embedded_links2($row,$ext,$session = NULL)
{
$frame = trim($row->attributes->getNamedItem('data-modal-iframe')->nodeValue);
// lectures in preview mode are put at external pages, so we have to download them extra
$view = get_page_xpath($frame,$session);
if (!$view)
return NULL;
else
return $view->query('//div[@id="QL_player_container_first"]//embed[@id="me_flash_0" and ends-with(@flashvars,".'.$ext.'")]');
}*/
/**
 * Trims a string after turning non-breaking spaces into ordinary spaces.
 *
 * In UTF-8 the non-breaking space is the two-byte sequence 0xC2 0xA0, and it
 * has to be replaced as a whole. A byte-wise strtr() table cannot do that:
 * strtr() ignores surplus characters when its from/to strings differ in
 * length, and mapping the lone byte 0xA0 also damages other characters that
 * merely end in it (for example "à" = 0xC3 0xA0).
 *
 * @param string $s text to clean
 * @return string $s with non-breaking spaces converted and surrounding
 *                whitespace removed
 */
function coursera_trim($s)
{
    // preg_match with the /u flag succeeds only on well-formed UTF-8
    if (preg_match('//u', $s) === 1)
        return trim(str_replace("\xc2\xa0", ' ', $s));

    // not UTF-8: treat it as single-byte text where 0xA0 is the hard space
    return trim(strtr($s, "\xa0", ' '));
}
// ---- general php code -----------------------------------------------
/**
 * Emits the lines of the generated download (shell) script on standard output.
 *
 * Directory creation is deferred: mkdir_print() only remembers the commands,
 * and they are written out by the first wget_file_print() that follows, so a
 * directory with nothing to download is never created.
 */
class BashPrinter
{
    // "mkdir" lines remembered by mkdir_print() and not yet written out
    private $dirLines = array();

    /**
     * Escapes text for use inside a double-quoted shell word.
     *
     * Titles, links and the session come from scraped pages or the command
     * line, so the characters the shell still interprets inside double
     * quotes (backslash, double quote, dollar, backtick) are backslash-escaped.
     *
     * @param string $text raw text
     * @return string text safe to place between double quotes
     */
    private static function dq($text)
    {
        return strtr($text, array(
            '\\' => '\\\\',
            '"'  => '\\"',
            '$'  => '\\$',
            '`'  => '\\`',
        ));
    }

    /**
     * Prints the commands that download one file unless it already exists.
     *
     * On a failed download the partial file is removed and the script's
     * ERRORS counter is incremented; on success the link is appended to $log
     * when a log file is given.
     *
     * @param string      $link            URL to download
     * @param string      $target_filename path of the file to create
     * @param string|null $log             file that collects downloaded links
     * @param string|null $session         value for the "session" cookie
     */
    public function wget_file_print($link,$target_filename,$log = NULL,$session = NULL)
    {
        // flush the pending mkdir commands now that there is something to fetch
        foreach ($this->dirLines as $line)
            echo $line;
        $this->dirLines = array();

        $quoted_link = '"'.self::dq($link).'"';
        $quoted_target = '"'.self::dq($target_filename).'"';

        echo 'if [ ! -e '.$quoted_target.' ] ; then'."\n";
        // "$@" forwards the options given to the generated script, each as one word
        echo ' wget "$@" -nc --no-cookies ';
        if ($session!==NULL)
            echo ' --header "Cookie: session='.self::dq($session).'" ';
        echo $quoted_link.' -O '.$quoted_target."\n";
        echo ' if [ $? -ne 0 ]'."\n";
        echo ' then'."\n";
        echo ' rm -f '.$quoted_target.'; ERRORS=$((ERRORS+1))'."\n";
        if ($log!==NULL)
        {
            echo ' else'."\n";
            echo ' echo '.$quoted_link.' >> "'.self::dq($log).'"'."\n";
        }
        echo ' fi'."\n";
        echo 'fi'."\n";
    }

    /**
     * Remembers the commands that create the directory (or directories) for
     * a section, replacing any commands remembered before.
     *
     * @param string $dir    section directory name
     * @param array  $extras parsed options; when it holds the split-dirs key
     *                       its value is iterated as a list of parent
     *                       directories, each getting its own "$dir" inside
     */
    public function mkdir_print($dir,$extras)
    {
        global $split_dirs_key;

        $this->dirLines = array("\n");
        if (array_key_exists($split_dirs_key,$extras))
        {
            foreach ($extras[$split_dirs_key] as $parent)
                $this->dirLines[] = 'mkdir -p "'.self::dq($parent.'/'.$dir).'"'."\n";
        }
        else
            // -p keeps a second run of the script from failing on existing directories
            $this->dirLines[] = 'mkdir -p "'.self::dq($dir).'"'."\n";
    }
}
/**
 * Post-processes the parsed extra options in place.
 *
 * When the split-dirs option is present its space-separated value is turned
 * into a list of directory names; every other option is left untouched.
 *
 * @param array $extras [in,out] option name => raw value
 */
function process_extra_arguments(&$extras)
{
    global $split_dirs_key;

    if (!array_key_exists($split_dirs_key,$extras))
        return;

    $dirs_text = $extras[$split_dirs_key];
    $extras[$split_dirs_key] = explode(' ',$dirs_text);
}
/**
 * Parses HTML text (expected to be UTF-8) into a DOM document.
 *
 * Parser diagnostics are collected internally and discarded, and the
 * caller's libxml error mode is restored afterwards.
 *
 * @param string $content HTML source
 * @return DOMDocument the parsed document; an empty document when $content
 *                     is empty
 */
function get_dom($content)
{
    $dom = new DOMDocument();
    $errors_mode = libxml_use_internal_errors(TRUE);

    // loadHTML() rejects an empty string, so parse only when there is content
    if ((string)$content !== '')
    {
        // Turn every non-ASCII character into a numeric entity so the parser
        // does not have to guess the encoding. This replaces the deprecated
        // mb_convert_encoding(..., 'HTML-ENTITIES', ...) conversion.
        $content = mb_encode_numericentity($content, array(0x80, 0x10FFFF, 0, 0x1FFFFF), 'UTF-8');
        $dom->loadHTML($content);
    }

    libxml_clear_errors();
    libxml_use_internal_errors($errors_mode);
    // NOTE(review): assigned after loading, so it cannot influence how this
    // document was parsed -- confirm whether it was meant to precede the load.
    $dom->preserveWhiteSpace = false;
    return $dom;
}
/**
 * Makes a title usable as a file name.
 *
 * "?" becomes a space, ":" a period, a double quote a single quote, and both
 * slash and backslash an underscore; the result is trimmed with
 * coursera_trim() and runs of whitespace are collapsed to one space.
 *
 * @param string $s raw title
 * @return string cleaned title
 */
function fix_filename($s)
{
    $translated = strtr($s,'?:"/\\',' .\'__');
    $trimmed = coursera_trim($translated);
    return preg_replace('/\s{2,}/', ' ', $trimmed);
}
/**
 * Downloads a page and returns an XPath evaluator over its DOM.
 *
 * The raw response (or FALSE on failure) is also stored in the global
 * $debug_page.
 *
 * @param string      $url     page address
 * @param string|null $session when given, sent as the "session" cookie
 * @return DOMXPath|null evaluator for the parsed page, or NULL when the
 *                       download failed
 */
function get_page_xpath($url,$session = NULL)
{
    global $debug_page;

    $options = array('method'=>'GET');
    if ($session!==NULL)
        $options['header'] = 'Cookie: session='.$session.';';
    $context = stream_context_create(array('http'=>$options));

    // warnings are silenced on purpose: a lost connection is reported via NULL
    $debug_page = @file_get_contents($url,false,$context);
    if ($debug_page===FALSE)
        return NULL;

    return new DOMXPath(get_dom($debug_page));
}
?>
<?php
/* ===================== COURSERA GETTER ============================
tags: [coursera video download] [coursera lecture download]
CHANGELOG:
---------
2013-08-06 * additional info from tooltip is processed to be used
as a filename
2013-07-15 * if there is nothing to download for given folder
it is not created
2013-06-29 * updated extraction for embedded videos
* added option to limit extraction from main page
2013-05-15, 2 * corrected reporting failed download or extraction
2013-05-15 * reporting failed extraction of main resource
2013-05-03 * more unification with c_preview utility -- ability to
download embedded videos as well (read: webm), add
"~" character before file extension to make Coursera
Getter fetch embedded video
* "--extension" option is no longer supported -- add
dot (".") before file extension instead
2013-04-22 * dropping a week/lecture phrase from filenames as well
2013-04-15 * refactoring
2013-04-07 * keeping log of downloads (file "downloads.log") as
countermeasure for renaming the lectures/notes
* added "beep" option to make a sound at the end of
downloading
2013-04-03 * bugfix: the title of lecture sometimes was ignored
2013-02-15 * more accurate whitespace removal
2013-02-02 * automatically removes corrupted files
2013-01-23 * new option "reverse" for the courses which put
sections in "from newest to oldest" order
2013-01-08 * Coursera changed its web format, along with structure
and CSS tags/classes this version hopefully is
changed to reflect all of those
2012-10-12 * added option --drop_week to drop "week X." part from
the directory (remind me this was supposed to be dead
simple tool ;-D)
2012-10-11 * added option --split_dirs to save files into
subdirectories according to file extensions
2012-10-03 * added option --extension to get resources by
extension of the files, not the tooltips
2012-09-18 * added c_common.php
* you can specify via filetypes what to grab and what
extension set
2012-07-05 * initial release of Coursera preview getter
2012-06-14 * added little control if there is insufficient number
of arguments
2012-06-11 * UTF-8 in filenames are supported
(another module for PHP is required -- mbstring)
* replaces slash and backslash with underscore
2012-06-07 * changed this info, added another way of getting
cookies
2012-06-06, 2 * extensions casing reverted -- they matter again
* directories are named according to Lectures sections
* handles multiple files for given file type
* files are counted within each directory, not within
entire course
2012-06-06 * extensions can be given lower/upper-case, they do not
matter
2012-06-05, 2 * creates weekly subdirectories and puts the files in
there
2012-06-05 * initial release
WHAT IT DOES:
------------
* it parses given course Lectures page
* it extracts all the desired content (links for videos, slides, etc)
* it uses consistent naming of the files
* it replaces colon with period (hello Windows users)
* it finally creates a bunch of wget command ready to execute
* it ignores already existing files, so it is safe to rerun wget
script just to get missing files (note this might be not true if you
update this script, because of possible change in naming convention)
WHAT YOU NEED:
-------------
1. proper shell (Windows users -- of course I recommend switching to
Linux entirely, but as a workaround Cygwin should be fine -- I
don't know how about the tools I mention below)
2. wget (in openSUSE `sudo zypper in wget`)
3. php5 (in openSUSE `sudo zypper in php5`)
4. php5-openssl (in openSUSE `sudo zypper in php5-openssl`)
5. php5-mbstring (in openSUSE `sudo zypper in php5-mbstring`)
6. and an adventurous soul -- in Firefox, go to
Edit/Preferences/Privacy/Remove Individual Cookie (don't freak
out!) search for "coursera". Several items should appear -- look
for key session for the site you would like to download (for
example "nlp"). Copy the value (content) of that key. Close the
preferences window (do **NOT** delete anything!) -- I will be
grateful for info if there is easier way
Ok, so now you know the address of the site, the session, and the files
you would like to download.
Jan de Vos sent another way for getting cookies (step 6):
* find the cookies directory -- in case of Linux it will be something
like this `~/.mozilla/firefox/88xw1k8g.default/`
* run sqlite3 -- `sqlite3 cookies.sqlite`
* run SQL query -- `select path,value from moz_cookies where
baseDomain = 'coursera.org' and name='session';`
You will get the session codes for all courses you are enrolled on.
USAGE:
-----
php c_get.php "link_to_lectures_page" "file types" "session code" > wget_script_name.sh
sh wget_script_name.sh
Example (this is one line):
php c_get.php "https://class.coursera.org/crypto/lecture/index" "MP4 PDF" "HERE&IS%MY&SESSION^VALUE@WHICH*OF!COURSE*I_WONT*TELL9YOU" > wgetter.sh
the one above creates appropriate script for wget for downloading videos
(MP4) and slides (PDF).
Please note the file type casing (MP4 vs. mp4) must match the casing of
the title (tooltip) of given category of files -- check the Lectures
page to find it out.
It is possible to pass file type in format "FileFormat=FileExtension",
so this script will look for one thing, but save as another. For example some courses list pdf files as "Slides". In such case pass such file format "Slides=pdf" -- this means "Slides" will be grabbed, but saved with extension "pdf".
Some courses do not use consistent naming of tooltips (unfortunately),
in such case you can download files directly by extension -- add dot
(".") character in front of file type. As previously, pay attention to
lowercase/uppercase (e.g. usually the extension is "mp4" but tooltip is
"MP4"). Example:
php c_get.php "https://class.coursera.org/scala/lecture/index" ".mp4 .pdf" "HERE&IS%MY&SESSION" > wgetter.sh
Yet another source of files are embedded frames (the ones when you click
to view lecture online). One of the advantages of this is ability to
download video in webm format. Instead of "." use now "~", for example:
php c_get.php "https://class.coursera.org/scala/lecture/index" "~webm .pdf" "HERE&IS%MY&SESSION" > wgetter.sh
NOTE: the video will be downloaded from embedded player, but handouts
(pdf) will be downloaded from download (resources) section.
If you would like to have notes in the "notes" subdirectory and lectures
in "lectures" one add "--split_dirs" argument in such way:
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --split_dirs="videos texts" > wgetter.sh
so "mp4" files will go into "videos" subdirectory and "pdf" files into
"texts" subdirectory.
If the directories with opening "Week X." seem redundant add
"--drop_week" option:
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --drop_week > wgetter.sh
Instead of having "02. Week 1: Functions & Evaluations" you will get
"02. Functions & Evaluations".
For courses which do not use natural order (from oldest to newest) there
is an option "reverse":
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --reverse > wgetter.sh
This will tell this script to use reversed order of numbering sections.
The courses with embedded videos are harder to process -- extraction
takes more time. If you know in advance that you don't want to extract
some portion of the lectures you can pass the limit option:
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --limit="Week 9" > wgetter.sh
This will start extraction from section containing phrase "Week 9". In
case of reversed order -- it will stop extraction on phrase "Week 9".
In all above examples, video lecture (mp4/webm) came first -- the
program assumes it is the main resource, and if it is missing it will
report this fact. It won't report missing resource of any other kind.
Once the actual getter script is created (here: wgetter.sh) you can pass
any extra option for "wget". For example you can run it as:
sh wgetter.sh --limit-rate=100k
This would limit speed of download to 100KB/s. See "man wget" for more
options.
*
SECURITY NOTE:
-------------
Do NOT share your session code with anyone, and this means -- do NOT
share the wget script with anyone as well!
================================================================== */
require_once 'c_common.php';
/**
 * Prints the complete download script for a course lectures page.
 *
 * Walks every section of the page, and for every lecture in it looks up the
 * resources named by $extensions, printing one guarded wget command per link
 * that is not yet listed in "downloads.log".
 *
 * Each entry of $extensions is either "FileType" or "FileType=FileExtension"
 * (e.g. "PDF", "Slides=pdf"); the last part becomes the saved extension.
 * The first character of the type selects how links are found:
 *   "~" -- in the embedded viewer page, by video type,
 *   "." -- among the lecture resources, by text contained in the href,
 *   otherwise -- by text contained in the link's title (tooltip).
 *
 * @param DOMXPath $xpath      evaluator for the lectures page
 * @param string   $session    session cookie value
 * @param array    $extensions list of file type specifications
 * @param array    $extras     parsed extra options (option name => value)
 */
function print_wget($xpath,$session,$extensions,$extras)
{
    global $split_dirs_key,$drop_deco_key,$reverse_key,$beep_key,$debug_key,$limit_key;
    global $debug_page;

    $bash_printer = new BashPrinter();
    process_extra_arguments($extras);

    $use_split   = array_key_exists($split_dirs_key,$extras);
    $use_deco    = array_key_exists($drop_deco_key,$extras);
    $use_reverse = array_key_exists($reverse_key,$extras);
    $use_debug   = array_key_exists($debug_key,$extras);
    $use_limit   = array_key_exists($limit_key,$extras);

    // links downloaded by earlier runs; guards against renamed lectures
    $downloads_filename = 'downloads.log';
    $downloads = array();
    if (file_exists($downloads_filename))
    {
        $downloads = file($downloads_filename, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($downloads===FALSE) // unreadable log: behave as if it were empty
            $downloads = array();
    }

    echo "ERRORS=0\n";
    $group_list = c_query_groups($xpath);
    $group_count = $use_reverse ? $group_list->length : 1;

    // in reverse order pretend limit was NOT hit;
    // otherwise, if there is no limit given by user, pretend it was hit
    $limit_hit = $use_reverse ? false : !$use_limit;

    foreach ($group_list as $group)
    {
        $item_count = 0;
        $dir = c_query_dir($xpath,$group);
        if (!$limit_hit && $use_limit)
            $limit_hit = (strpos($dir,$extras[$limit_key])!==FALSE);
        $dir = c_deco_dir($dir,$group_count,$use_deco);
        $group_count += $use_reverse ? -1 : +1;

        if ($use_limit)
        {
            if ($use_reverse)
            {
                // reversed pages list the newest section first: stop at the limit
                if ($limit_hit)
                    break;
            }
            elseif (!$limit_hit)
                continue; // natural order: skip sections until the limit is reached
        }

        $bash_printer->mkdir_print($dir,$extras);
        $node_list = c_query_list($xpath,$group);
        foreach ($node_list as $node)
        {
            ++$item_count;
            c_query_row($xpath,$node,$use_deco,$row,$title);

            for ($i_ext = 0; $i_ext < count($extensions); ++$i_ext)
            {
                $ext_parts = explode('=',$extensions[$i_ext]);
                // a doubled space in the type list yields an empty entry
                if ($ext_parts[0]==='')
                    continue;
                // the last part is the extension used for the saved file
                $save_ext = strtolower($ext_parts[count($ext_parts)-1]);

                if ($use_split)
                    $target_dir = $extras[$split_dirs_key][$i_ext].'/'.$dir;
                else
                    $target_dir = $dir;

                $attr_extractor = 'href';
                if ($ext_parts[0][0]=='~') // extract link by extension from viewer frame
                {
                    $ext_parts[0] = substr($ext_parts[0],1);
                    $links = c_get_embedded_links($row,$ext_parts[0],$session);
                    if ($links===NULL)
                    {
                        file_put_contents('php://stderr', "Loading embedded frame failed: '$dir/$title'\n");
                        continue;
                    }
                    else if ($links->length===0 && $i_ext===0)
                    {
                        // only the first (main) resource is reported as missing
                        file_put_contents('php://stderr', "No resources '$ext_parts[0]' found for '$dir/$title'\n");
                        if ($use_debug)
                            file_put_contents('DEBUG_'.$title,$debug_page);
                        continue;
                    }
                    else
                        $attr_extractor = 'src';

                    if ($use_debug)
                        file_put_contents('php://stderr', "For $dir/$title ".$links->length." '$ext_parts[0]' links found.\n");
                }
                else if ($ext_parts[0][0]=='.') // extract link by extension of the linked file
                {
                    $links = $xpath->query('.//div[@class="course-lecture-item-resource"]/a[contains(@href,"'.$ext_parts[0].'")]',$node);
                    $ext_parts[0] = substr($ext_parts[0],1);
                }
                else // extract link by tooltip of the link
                    $links = $xpath->query('.//a[contains(@title,"'.$ext_parts[0].'")]',$node);

                foreach ($links as $link)
                {
                    // several links of one type: tell them apart by their tooltip
                    $suffix = '';
                    if ($links->length>1)
                        $suffix = '.'.fix_filename($link->attributes->getNamedItem('title')->nodeValue);

                    $url = $link->attributes->getNamedItem($attr_extractor)->nodeValue;
                    if (!in_array($url,$downloads,true))
                    {
                        $target_filename = $target_dir.'/'.str_pad($item_count,3,'0',STR_PAD_LEFT).'. '.$title.$suffix.'.'.$save_ext;
                        $bash_printer->wget_file_print($url,$target_filename,$downloads_filename,$session);
                    }
                    else if ($use_debug)
                        file_put_contents('php://stderr', "$dir/$title '$ext_parts[0]' already downloaded.\n");
                }
            }
        }
    }

    echo "\n";
    echo 'if [ $ERRORS -ne 0 ] ; then echo "There were some errors while downloading. Run the script again." ; fi'."\n";
    if (array_key_exists($beep_key,$extras))
        echo "beep\n";
}
// ---- entry point: parse the command line and print the download script ----
if ($argc<4)
{
    file_put_contents('php://stderr', "Error: you should input minimum three arguments, the usage is:\n");
    // the option keys already carry their leading "--"
    file_put_contents('php://stderr', "\"LECTURES_URL\" \"FILE_TYPES\" \"SESSION_CODE\" [$beep_key] [$reverse_key] [$drop_deco_key] [$debug_key] [$limit_key=\"section phrase\"] [$split_dirs_key=\"directories per file type\"]\n");
    exit(1);
}
else
{
    array_shift($argv); // drop the script name
    $url = array_shift($argv);
    $extensions = explode(' ',array_shift($argv));
    $session = array_shift($argv);

    // everything that is left must be a known "--option" or "--option=value"
    $known_options = array($split_dirs_key,$drop_deco_key,$reverse_key,$beep_key,$debug_key,$limit_key);
    $extras = array();
    foreach ($argv as $a)
    {
        // split on the first "=" only, so that values may contain "=" themselves
        $parts = explode('=',$a,2);
        if (!in_array($parts[0],$known_options,true))
        {
            file_put_contents('php://stderr', 'Unknown extra argument "'.$parts[0]."\"\n");
            exit(1);
        }
        $extras[$parts[0]] = count($parts)==1 ? NULL : $parts[1];
    }

    $xpath = get_page_xpath($url,$session);
    if ($xpath===NULL)
    {
        // without this the caller is left with an empty script and no explanation
        file_put_contents('php://stderr', "Error: could not load the lectures page '$url'\n");
        exit(1);
    }
    print_wget($xpath,$session,$extensions,$extras);
}
?>
<?php
/* ===================== COURSERA PREVIEW GETTER =======================
tags: [coursera video download] [coursera lecture download]
CHANGELOG:
---------
2013-07-15 * if there is nothing to download for given folder
it is not created
2013-06-29 * technical change to work with c_common
2013-05-15 * reporting failed download or extraction of main
resource
2013-05-03 * just keeping in sync with c_get
2013-04-22 * dropping a week/lecture phrase from filenames as well
2013-04-15 * update to follow last Coursera changes
2012-10-12 * sharing code with c_common.php;
some fixes to follow Coursera changes of preview sites
2012-07-05 * initial release
WHAT IT DOES:
------------
* it is counterpart for Coursera getter, but this one works only for course previews
-- the ones with embedded video player, and nothing else
WHAT YOU NEED:
-------------
1. proper shell (Windows users -- of course I recommend switching to Linux entirely, but as a workaround Cygwin should be fine -- I don't know how about the tools I mention below)
2. wget (in openSUSE `sudo zypper in wget`)
3. php5 (in openSUSE `sudo zypper in php5`)
4. php5-mbstring (in openSUSE `sudo zypper in php5-mbstring`)
USAGE:
-----
php c_preview.php "link_to_preview_page" "video_file_type" > wget_script_name.sh
sh wget_script_name.sh
Example (this is one line):
php c_preview.php "https://class.coursera.org/crypto-preview/lecture/index" "mp4"
the one above creates appropriate script for wget for downloading videos (MP4). Now execute
sh wgetter.sh
Please note the file type is not guaranteed to exist on the server
(so far "webm" and "mp4" are supported by Coursera).
================================================================== */
require_once 'c_common.php';
// https://class.coursera.org/machlearning-001/lecture/preview/index
/**
 * Prints the download script for a course preview page.
 *
 * For every lecture of every section the embedded viewer page is fetched and
 * the first video source of the requested type is turned into a guarded wget
 * command. Lectures whose viewer page cannot be loaded, or which offer no
 * such video, are reported on standard error and skipped.
 *
 * @param DOMXPath $xpath  evaluator for the preview lectures page
 * @param string   $ext    video type to fetch; also the saved file
 *                         extension (lower-cased)
 * @param array    $extras parsed extra options (option name => value)
 */
function print_wget($xpath,$ext,$extras)
{
    global $drop_deco_key,$reverse_key,$beep_key;

    $bash_printer = new BashPrinter();
    process_extra_arguments($extras);

    $use_deco    = array_key_exists($drop_deco_key,$extras);
    $use_reverse = array_key_exists($reverse_key,$extras);

    echo "ERRORS=0\n";
    $group_list = c_query_groups($xpath);
    // reversed pages list the newest section first, so number them downwards
    $group_count = $use_reverse ? $group_list->length : 1;

    foreach ($group_list as $group)
    {
        $item_count = 0;
        $dir = c_deco_dir(c_query_dir($xpath,$group),$group_count,$use_deco);
        $group_count += $use_reverse ? -1 : +1;
        $bash_printer->mkdir_print($dir,$extras);

        // get the list of all lectures within current group (week)
        $node_list = c_query_list($xpath,$group);
        foreach ($node_list as $node)
        {
            ++$item_count;
            c_query_row($xpath,$node,$use_deco,$row,$title);

            $video_list = c_get_embedded_links($row,$ext);
            if ($video_list===NULL)
            {
                file_put_contents('php://stderr', "Loading embedded frame failed: '$dir/$title'\n");
                continue;
            }
            else if ($video_list->length==0)
            {
                file_put_contents('php://stderr', "Filetype '$ext' not found for '".$title."'\n");
                continue;
            }

            $video = $video_list->item(0);
            $vid_src = $video->attributes->getNamedItem('src')->nodeValue;
            $target_filename = $dir.'/'.str_pad($item_count,3,'0',STR_PAD_LEFT).'. '.$title.'.'.strtolower($ext);
            $bash_printer->wget_file_print($vid_src,$target_filename);
        }
    }

    echo 'if [ $ERRORS -ne 0 ] ; then echo "There were some errors while downloading. Run the script again." ; fi'."\n";
    if (array_key_exists($beep_key,$extras))
        echo "beep\n";
}
// ---- entry point: parse the command line and print the download script ----
if ($argc<3)
{
    file_put_contents('php://stderr', "Error: you should input minimum two arguments, the usage is:\n");
    // the option keys already carry their leading "--"
    file_put_contents('php://stderr', "\"LECTURES_URL\" \"FILE_TYPES\" [$beep_key] [$reverse_key] [$drop_deco_key] [$split_dirs_key=\"directories per file type\"]\n");
    exit(1);
}
else
{
    array_shift($argv); // drop the script name
    $url = array_shift($argv);
    $extensions = array_shift($argv);

    // everything that is left must be a known "--option" or "--option=value"
    $known_options = array($split_dirs_key,$drop_deco_key,$reverse_key,$beep_key);
    $extras = array();
    foreach ($argv as $a)
    {
        // split on the first "=" only, so that values may contain "=" themselves
        $parts = explode('=',$a,2);
        if (!in_array($parts[0],$known_options,true))
        {
            file_put_contents('php://stderr', 'Unknown extra argument "'.$parts[0]."\"\n");
            exit(1);
        }
        $extras[$parts[0]] = count($parts)==1 ? NULL : $parts[1];
    }

    $xpath = get_page_xpath($url);
    if ($xpath===NULL)
    {
        // without this the caller is left with an empty script and no explanation
        file_put_contents('php://stderr', "Error: could not load the preview page '$url'\n");
        exit(1);
    }
    print_wget($xpath,$extensions,$extras);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment