-
-
Save joeromero/6417680 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* ===================== COMMON CODE ============================ | |
This file is used by two other scripts (c_get.php and c_preview.php).
2013-07-15 * bash printing functions are wrapped into class | |
2013-06-29 * technical change to make c_get work | |
2013-05-15, 2 * polished code for other scripts needs | |
2013-05-15 * suppressed warning on lost internet connection | |
2013-05-06 * generating "wget" command with all variables passed | |
from shell | |
2013-05-03 * more shared code put in here | |
2013-04-24 * added "chapter" phrase for removal | |
2013-04-22 * added function for dropping a week/lecture phrase | |
* removing multiple whitespaces in a row | |
2013-04-15 * added Coursera CSS getters and more shared code | |
2013-02-15 * special function for whitespace removal | |
2013-02-11 * converting character 0xc2 (?) and hard space to space | |
2012-10-19 * question marks in filenames are converted to spaces | |
(because of MS-Windows) | |
2012-09-18 * initial release | |
================================================================== */ | |
// ---- Coursera specific stuff ---------------------------------------- | |
// Command-line switches understood by the getter scripts. Each value is the
// literal switch as typed by the user, leading dashes included.
$split_dirs_key = '--split_dirs'; // value: space-separated directories, one per file type
$drop_deco_key = '--drop_week';   // strip "Week N"/"Lecture N"/"Chapter N" prefixes from names
$reverse_key = '--reverse';       // number the sections from last to first
$beep_key = '--beep';             // append a "beep" command to the generated script
$debug_key = '--debug';           // extra diagnostics on stderr / DEBUG_* page dumps
$limit_key = '--limit';           // value: phrase identifying the section to start/stop at
// Raw content of the page most recently fetched by get_page_xpath() (FALSE on failure).
$debug_page = NULL;
/**
 * Finds the section headers on a lectures page.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the lectures page
 * @return DOMNodeList every <div> whose class attribute contains
 *                     "course-item-list-header", in document order
 */
function c_query_groups($xpath)
{
    return $xpath->query('//div[contains(@class,"course-item-list-header")]');
}
/**
 * Removes a leading "Week N", "Lecture N" or "Chapter N" phrase (plus any
 * following dots, colons, dashes and whitespace) from a name.
 *
 * @param string $name section or lecture name
 * @return string the shortened name, or $name unchanged when nothing would
 *                be left (or the regex engine reported an error)
 */
function drop_deco($name)
{
    $stripped = preg_replace('/^(Week|Lecture|Chapter)[\s]*\d+[.:\s\-]*/','',$name);
    // preg_replace() yields NULL on failure; never return an empty name either.
    if ($stripped===NULL || $stripped==='')
        return $name;
    return $stripped;
}
/**
 * Reads the raw (undecorated) section title of a section header.
 *
 * @param DOMXPath $xpath XPath evaluator bound to the lectures page
 * @param DOMNode  $group section header node, as returned by c_query_groups()
 * @return string trimmed text of the header's first <h3> child, or '' when
 *                the header has no <h3>
 */
function c_query_dir($xpath,$group)
{
    $heading = $xpath->query('./h3',$group)->item(0);
    // A header without <h3> used to dereference NULL; report an empty title instead.
    if ($heading===NULL)
        return '';
    return coursera_trim($heading->nodeValue);
}
/**
 * Builds the on-disk directory name for a section: "NN. Title".
 *
 * @param string $dir         raw section title
 * @param int    $group_count ordinal of the section; left-padded with zeros to two digits
 * @param bool   $drop_deco   when true the "Week N" style prefix is removed first
 * @return string directory name with filename-unsafe characters replaced
 */
function c_deco_dir($dir,$group_count,$drop_deco)
{
    if ($drop_deco)
        $dir = drop_deco($dir);
    return str_pad((string)$group_count,2,'0',STR_PAD_LEFT).'. '.fix_filename($dir);
}
/**
 * Lists the lecture items that belong to a section header.
 *
 * The items are the <li> descendants of the first element that follows the
 * header. The element axis is used instead of DOMNode::nextSibling because
 * nextSibling may be a whitespace text node (which has no descendants) or
 * NULL (in which case the query would run against the whole document and
 * return every <li> of the page).
 *
 * @param DOMXPath $xpath XPath evaluator bound to the lectures page
 * @param DOMNode  $group section header node
 * @return DOMNodeList lecture <li> nodes in document order; empty when the
 *                     header has no following sibling element
 */
function c_query_list($xpath,$group)
{
    return $xpath->query('./following-sibling::*[1]//li',$group);
}
/**
 * Extracts the lecture link and a filename-safe lecture title from a list item.
 *
 * @param DOMXPath     $xpath     XPath evaluator bound to the lectures page
 * @param DOMNode      $node      lecture <li> node
 * @param bool         $drop_deco when true the "Week N" style prefix is removed from the title
 * @param DOMNode|null $row       [out] first <a> whose class contains "lecture-link",
 *                                or NULL when the item has none
 * @param string       $title     [out] cleaned text of the link's first child node
 *                                ('' when there is no link)
 */
function c_query_row($xpath,$node,$drop_deco,&$row,&$title)
{
    $row = $xpath->query('.//a[contains(@class,"lecture-link")]',$node)->item(0);
    // Guard against items without a lecture link or with an empty link.
    $raw_title = ($row!==NULL && $row->firstChild!==NULL) ? $row->firstChild->nodeValue : '';
    $title = fix_filename($raw_title);
    if ($drop_deco)
        $title = drop_deco($title);
}
/**
 * Fetches the embedded-player page of a lecture and returns its video sources.
 *
 * Lectures in preview mode are put on external pages, so the page named by
 * the link's "data-modal-iframe" attribute has to be downloaded separately.
 *
 * @param DOMNode     $row     lecture link node, as produced by c_query_row()
 * @param string      $ext     video subtype to look for (matched as type="video/$ext")
 * @param string|null $session value of the "session" cookie, or NULL for anonymous access
 * @return DOMNodeList|null matching <source> nodes (possibly an empty list),
 *                          or NULL when the frame page could not be loaded
 */
function c_get_embedded_links($row,$ext,$session = NULL)
{
    $frame_attr = ($row!==NULL && $row->attributes!==NULL)
        ? $row->attributes->getNamedItem('data-modal-iframe')
        : NULL;
    // No frame address means there is nothing to load -- same outcome as a failed download.
    if ($frame_attr===NULL)
        return NULL;
    $frame = trim($frame_attr->nodeValue);
    if ($frame==='')
        return NULL;

    $view = get_page_xpath($frame,$session);
    if (!$view)
        return NULL;

    $source = 'source[@type="video/'.$ext.'"]';
    $links = $view->query('//video[@id="QL_video_element_first"]/'.$source);
    // Some pages nest the sources deeper inside the player container.
    if ($links->length===0)
        $links = $view->query('//div[@id="QL_player_container_first"]//'.$source);
    return $links;
}
/*function c_get_embedded_links2($row,$ext,$session = NULL) | |
{ | |
$frame = trim($row->attributes->getNamedItem('data-modal-iframe')->nodeValue); | |
// lectures in preview mode are put at external pages, so we have to download them extra | |
$view = get_page_xpath($frame,$session); | |
if (!$view) | |
return NULL; | |
else | |
return $view->query('//div[@id="QL_player_container_first"]//embed[@id="me_flash_0" and ends-with(@flashvars,".'.$ext.'")]'); | |
}*/ | |
/**
 * Trims a string after turning non-breaking spaces into ordinary spaces.
 *
 * DOM text is UTF-8, where a non-breaking space is the two-byte sequence
 * C2 A0. The sequence is replaced as a whole: replacing the bytes one by one
 * corrupts other characters that contain them (e.g. "à" is C3 A0), and
 * strtr() with a shorter "to" string silently ignores the extra "from" bytes.
 *
 * @param string $s text to clean
 * @return string text with hard spaces converted and surrounding whitespace removed
 */
function coursera_trim($s)
{
    $s = (string)$s;
    $s = str_replace("\xc2\xa0",' ',$s);
    // Input that is not valid UTF-8 is presumably single-byte (Latin-1 style),
    // where a lone A0 byte is the hard space.
    if (preg_match('//u',$s)!==1)
        $s = str_replace("\xa0",' ',$s);
    return trim($s);
}
// ---- general php code ----------------------------------------------- | |
/**
 * Prints the shell commands of the generated download script to stdout.
 *
 * "mkdir" lines are buffered by mkdir_print() and only written by the next
 * wget_file_print() call, so a directory with nothing to download is never
 * created.
 */
class BashPrinter
{
    /** @var string[] pending directory-creation lines */
    private $dirLines = array();

    /**
     * Quotes a value for a POSIX shell using single quotes, so that "$",
     * backticks, double quotes and backslashes in titles, links or session
     * codes are taken literally. (escapeshellarg() is avoided because it may
     * drop multi-byte characters depending on the locale.)
     */
    private static function sh_quote($s)
    {
        return "'".str_replace("'","'\\''",(string)$s)."'";
    }

    /**
     * Prints the guarded "wget" invocation for one file.
     *
     * The generated fragment skips files that already exist, removes the
     * target and increments ERRORS when wget fails and, when $log is given,
     * appends the link to the log file on success.
     *
     * @param string      $link            URL to download
     * @param string      $target_filename path of the file to create
     * @param string|null $log             path of the download log, or NULL for no logging
     * @param string|null $session         value of the "session" cookie, or NULL for none
     */
    public function wget_file_print($link,$target_filename,$log = NULL,$session = NULL)
    {
        // Flush directory creation buffered by mkdir_print().
        foreach ($this->dirLines as $line)
            echo $line;
        $this->dirLines = array();

        $target = self::sh_quote($target_filename);
        $url = self::sh_quote($link);

        echo 'if [ ! -e '.$target.' ] ; then'."\n";
        // "$@" forwards the options given to the generated script, one word per option.
        echo ' wget "$@" -nc --no-cookies ';
        if ($session!==NULL)
            echo ' --header '.self::sh_quote('Cookie: session='.$session).' ';
        echo $url.' -O '.$target."\n";
        echo ' if [ $? -ne 0 ]'."\n";
        echo ' then'."\n";
        echo ' rm -f '.$target.'; ERRORS=$((ERRORS+1))'."\n";
        if ($log!==NULL)
        {
            echo ' else'."\n";
            echo ' printf \'%s\\n\' '.$url.' >> '.self::sh_quote($log)."\n";
        }
        echo ' fi'."\n";
        echo 'fi'."\n";
    }

    /**
     * Buffers the "mkdir" command(s) for a section directory, replacing any
     * commands still pending from a previous section.
     *
     * @param string $dir    section directory name
     * @param array  $extras parsed extra arguments; when it holds the split-dirs
     *                       key (a list of directories) one sub-directory is
     *                       created per listed directory
     */
    public function mkdir_print($dir,$extras)
    {
        global $split_dirs_key;
        $this->dirLines = array("\n");
        if (array_key_exists($split_dirs_key,$extras))
        {
            foreach ($extras[$split_dirs_key] as $parent)
                $this->dirLines[] = 'mkdir -p '.self::sh_quote($parent.'/'.$dir)."\n";
        }
        else
            // -p keeps a rerun of the generated script quiet when the directory exists.
            $this->dirLines[] = 'mkdir -p '.self::sh_quote($dir)."\n";
    }
}
/**
 * Normalises the parsed extra arguments in place.
 *
 * The value of the split-dirs option ("videos texts") is turned into a list
 * of directory names. Empty names -- produced by repeated spaces or a missing
 * value -- are dropped, because an empty directory name would later yield
 * paths such as "/01. Section".
 *
 * @param array $extras [in,out] map of option switch => raw value
 */
function process_extra_arguments(&$extras)
{
    global $split_dirs_key;
    if (!array_key_exists($split_dirs_key,$extras))
        return;
    // Already a list (e.g. processed twice) -- nothing to do.
    if (is_array($extras[$split_dirs_key]))
        return;
    $extras[$split_dirs_key] = preg_split('/ +/',(string)$extras[$split_dirs_key],-1,PREG_SPLIT_NO_EMPTY);
}
/**
 * Parses UTF-8 HTML into a DOMDocument, silencing markup warnings.
 *
 * Non-ASCII characters are converted to numeric entities first so that the
 * HTML parser does not misread them; this replaces the 'HTML-ENTITIES'
 * pseudo-encoding of mb_convert_encoding(), which is deprecated since PHP 8.2.
 *
 * @param string $content UTF-8 encoded HTML
 * @return DOMDocument parsed document; an empty document for empty input
 */
function get_dom($content)
{
    $dom = new DOMDocument();
    $content = (string)$content;
    // loadHTML() rejects an empty string (warning in old PHP, ValueError in PHP 8).
    if ($content!=='')
    {
        $errors_mode = libxml_use_internal_errors(TRUE);
        $content = mb_encode_numericentity($content,array(0x80,0x10FFFF,0,0x1FFFFF),'UTF-8');
        $dom->loadHTML($content);
        libxml_clear_errors();
        libxml_use_internal_errors($errors_mode);
    }
    // NOTE(review): set after loading, so it presumably does not affect the
    // document parsed above -- kept as in the original.
    $dom->preserveWhiteSpace = false;
    return $dom;
}
/**
 * Makes a title usable as a file or directory name.
 *
 * Replaces "?" with a space, ":" with ".", double quotes with single quotes
 * and both slashes with "_", then trims the result and collapses runs of
 * whitespace into a single space.
 *
 * @param string $s raw title
 * @return string cleaned name
 */
function fix_filename($s)
{
    $mapped = strtr((string)$s,'?:"/\\',' .\'__');
    return preg_replace('/\s{2,}/',' ',coursera_trim($mapped));
}
/**
 * Downloads a page and returns an XPath evaluator for it.
 *
 * The raw response is also stored in the global $debug_page (FALSE when the
 * download failed) so that callers can dump it for diagnostics.
 *
 * @param string      $url     address of the page
 * @param string|null $session value of the "session" cookie, or NULL for anonymous access
 * @return DOMXPath|null evaluator for the parsed page, or NULL when the page
 *                       could not be fetched
 */
function get_page_xpath($url,$session = NULL)
{
    global $debug_page;

    // file_get_contents() throws on an empty path in PHP 8; treat it as a failed fetch.
    if (!is_string($url) || $url==='')
    {
        $debug_page = FALSE;
        return NULL;
    }

    $http = array('method'=>'GET');
    if ($session!==NULL)
        $http['header'] = 'Cookie: session='.$session.';';
    $context = stream_context_create(array('http'=> $http));

    // Warnings are suppressed on purpose: a lost connection is reported via the NULL result.
    $content = @file_get_contents($url,false,$context);
    $debug_page = $content;
    if ($content===FALSE)
        return NULL;

    return new DOMXPath(get_dom($content));
}
?> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* ===================== COURSERA GETTER ============================ | |
tags: [coursera video download] [coursera lecture download] | |
CHANGELOG: | |
--------- | |
2013-08-06 * additional info from tooltip is processed to be used | |
as a filename | |
2013-07-15 * if there is nothing to download for given folder | |
it is not created | |
2013-06-29 * updated extraction for embedded videos | |
* added option to limit extraction from main page | |
2013-05-15, 2 * corrected reporting failed download or extraction | |
2013-05-15 * reporting failed extraction of main resource | |
2013-05-03 * more unification with c_preview utility -- ability to | |
download embedded videos as well (read: webm), add | |
"~" character before file extension to make Coursera | |
Getter fetch embedded video | |
* "--extension" option is no longer supported -- add
dot (".") before file extension instead | |
2013-04-22 * dropping a week/lecture phrase from filenames as well | |
2013-04-15 * refactoring | |
2013-04-07 * keeping log of downloads (file "downloads.log") as | |
countermeasure for renaming the lectures/notes | |
* added "beep" option to make a sound at the end of | |
downloading | |
2013-04-03 * bugfix: the title of lecture sometimes was ignored | |
2013-02-15 * more accurate whitespace removal | |
2013-02-02 * automatically removes corrupted files | |
2013-01-23 * new option "reverse" for the courses which put | |
sections in "from newest to oldest" order | |
2013-01-08 * Coursera changed its web format, along with structure | |
and CSS tags/classes this version hopefully is | |
changed to reflect all of those | |
2012-10-12 * added option --drop_week to drop "week X." part from | |
the directory (remind me this was supposed to be dead | |
simple tool ;-D) | |
2012-10-11 * added option --split_dirs to save files into | |
subdirectories according to the file extensions
2012-10-03 * added option --extension to get resources by | |
extension of the files, not the tooltips | |
2012-09-18 * added c_common.php | |
* you can specify via filetypes what to grab and what | |
extension set | |
2012-07-05 * initial release of Coursera preview getter | |
2012-06-14 * added little control if there is insufficient number | |
of arguments | |
2012-06-11 * UTF-8 in filenames are supported | |
(another module for PHP is required -- mbstring) | |
* replaces slash and backslash with underscore | |
2012-06-07 * changed this info, added another way of getting | |
cookies | |
2012-06-06, 2 * extensions casing reverted -- they matter again | |
* directories are named according to Lectures sections | |
* handles multiple files for given file type | |
* files are counted within each directory, not within | |
entire course | |
2012-06-06 * extensions can be given lower/upper-case, they do not | |
matter | |
2012-06-05, 2 * creates weekly subdirectories and puts the files in | |
there | |
2012-06-05 * initial release | |
WHAT IT DOES: | |
------------ | |
* it parses given course Lectures page | |
* it extracts all the desired content (links for videos, slides, etc) | |
* it uses consistent naming of the files | |
* it replaces colon with period (hello Windows users) | |
* it finally creates a bunch of wget command ready to execute | |
* it ignores already existing files, so it is safe to rerun wget | |
script just to get missing files (note this might be not true if you | |
update this script, because of possible change in naming convention) | |
WHAT YOU NEED: | |
------------- | |
1. proper shell (Windows users -- of course I recommend switching to | |
Linux entirely, but as a workaround Cygwin should be fine -- I | |
don't know how about the tools I mention below) | |
2. wget (in openSUSE `sudo zypper in wget`) | |
3. php5 (in openSUSE `sudo zypper in php5`) | |
4. php5-openssl (in openSUSE `sudo zypper in php5-openssl`) | |
5. php5-mbstring (in openSUSE `sudo zypper in php5-mbstring`) | |
6. and an adventurous soul -- in Firefox, go to | |
Edit/Preferences/Privacy/Remove Individual Cookie (don't freak | |
out!) search for "coursera". Several items should appear -- look | |
for key session for the site you would like to download (for | |
example "nlp"). Copy the value (content) of that key. Close the | |
preferences window (do **NOT** delete anything!) -- I will be | |
grateful for info if there is easier way | |
Ok, so now you know the address of the site, the session, and the files | |
you would like to download. | |
Jan de Vos sent another way of getting the cookies (step 6):
* find the cookies directory -- in case of Linux it will be something | |
like this `~/.mozilla/firefox/88xw1k8g.default/` | |
* run sqlite3 -- `sqlite3 cookies.sqlite` | |
* run SQL query -- `select path,value from moz_cookies where | |
baseDomain = 'coursera.org' and name='session';` | |
You will get the session codes for all courses you are enrolled on. | |
USAGE: | |
----- | |
php c_get.php "link_to_lectures_page" "file types" "session code" > wget_script_name.sh | |
sh wget_script_name.sh | |
Example (this is one line): | |
php c_get.php "https://class.coursera.org/crypto/lecture/index" "MP4 PDF" "HERE&IS%MY&SESSION^VALUE@WHICH*OF!COURSE*I_WONT*TELL9YOU" > wgetter.sh | |
the one above creates appropriate script for wget for downloading videos | |
(MP4) and slides (PDF). | |
Please note the file type casing (MP4 vs. mp4) must match the casing of | |
the title (tooltip) of given category of files -- check the Lectures | |
page to find it out. | |
It is possible to pass file type in format "FileFormat=FileExtension", | |
so this script will look for one thing, but save it as another. For example, some courses list pdf files as "Slides". In such a case pass the file format "Slides=pdf" -- this means "Slides" will be grabbed, but saved with the extension "pdf".
Some courses do not use consistent naming of tooltips (unfortunately), | |
in such case you can download files directly by extension -- add dot | |
(".") character in front of the file type. As previously, pay attention to
lowercase/uppercase (e.g. usually the extension is "mp4" but tooltip is | |
"MP4"). Example: | |
php c_get.php "https://class.coursera.org/scala/lecture/index" ".mp4 .pdf" "HERE&IS%MY&SESSION" > wgetter.sh | |
Yet another source of files are embedded frames (the ones when you click | |
to view lecture online). One of the advantages of this is ability to | |
download video in webm format. Instead of "." use now "~", for example: | |
php c_get.php "https://class.coursera.org/scala/lecture/index" "~webm .pdf" "HERE&IS%MY&SESSION" > wgetter.sh | |
NOTE: the video will be downloaded from embedded player, but handouts | |
(pdf) will be downloaded from download (resources) section. | |
If you would like to have notes in the "notes" subdirectory and lectures | |
in "lectures" one add "--split_dirs" argument in such way: | |
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --split_dirs="videos texts" > wgetter.sh | |
so "mp4" files will go into "videos" subdirectory and "pdf" files into | |
"texts" subdirectory. | |
If the directories with the opening "Week X." seem redundant, add the
"--drop_week" option: | |
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --drop_week > wgetter.sh | |
Instead of having "02. Week 1: Functions & Evaluations" you will get | |
"02. Functions & Evaluations". | |
For courses which do not use natural order (from oldest to newest) there | |
is an option "reverse": | |
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --reverse > wgetter.sh | |
This will tell this script to use reversed order of numbering sections. | |
The courses with embedded videos are harder to process -- extraction | |
takes more time. If you know in advance that you don't want to extract | |
some portion of the lectures you can pass the limit option: | |
php c_get.php "https://class.coursera.org/scala/lecture/index" "mp4 pdf" "HERE&IS%MY&SESSION" --limit="Week 9" > wgetter.sh | |
This will start extraction from section containing phrase "Week 9". In | |
case of reversed order -- it will stop extraction on phrase "Week 9". | |
In all above examples, video lecture (mp4/webm) came first -- the | |
program assumes it is the main resource, and if it is missing it will | |
report this fact. It won't report missing resource of any other kind. | |
Once the actual getter script is created (here: wgetter.sh) you can pass | |
any extra option for "wget". For example you can run it as: | |
sh wgetter.sh --limit-rate=100k | |
This would limit speed of download to 100KB/s. See "man wget" for more | |
options. | |
* | |
SECURITY NOTE: | |
------------- | |
Do NOT share your session code with anyone, and this means -- do NOT | |
share the wget script with anyone as well! | |
================================================================== */ | |
require_once 'c_common.php'; | |
/**
 * Walks the lectures page and prints the complete wget shell script to stdout.
 *
 * For every section a directory is prepared, and for every lecture each
 * requested file type is looked up in one of three ways, depending on the
 * first character of the type:
 *   "~ext"  -- <source> of the embedded player page (video/ext),
 *   ".ext"  -- resource links whose href contains ".ext",
 *   other   -- links whose title (tooltip) contains the given text.
 * A type may be written as "Type=ext" to save the file under another extension.
 * Links already listed in "downloads.log" are skipped. Problems are reported
 * on stderr, never on stdout.
 *
 * @param DOMXPath $xpath      evaluator bound to the lectures page
 * @param string   $session    value of the "session" cookie
 * @param string[] $extensions requested file types; the first one is the main resource
 * @param array    $extras     map of option switch => raw value
 */
function print_wget($xpath,$session,$extensions,$extras)
{
    global $split_dirs_key,$drop_deco_key,$reverse_key,$beep_key,$debug_key,$limit_key;
    global $debug_page;

    $bash_printer = new BashPrinter();
    process_extra_arguments($extras);
    // done with extra arguments ---------------------------------------

    $reverse = array_key_exists($reverse_key,$extras);
    $drop_deco = array_key_exists($drop_deco_key,$extras);
    $debug = array_key_exists($debug_key,$extras);
    $split = array_key_exists($split_dirs_key,$extras);
    // A limit without a phrase cannot match anything sensible; treat it as "no limit".
    $has_limit = array_key_exists($limit_key,$extras) && (string)$extras[$limit_key]!=='';

    // Every file type needs its own split directory, otherwise the target
    // path would start with "/" and point at the filesystem root.
    if ($split && count($extras[$split_dirs_key])<count($extensions))
    {
        file_put_contents('php://stderr', "Error: $split_dirs_key needs one directory per file type\n");
        return;
    }

    $downloads_filename = 'downloads.log';
    $downloads = array();
    if (file_exists($downloads_filename))
    {
        $downloads = file($downloads_filename, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($downloads===FALSE) // unreadable log -- behave as if there were none
            $downloads = array();
    }

    echo "ERRORS=0\n";
    $group_list = c_query_groups($xpath);
    $group_count = $reverse ? $group_list->length : 1;
    // Forward order: extraction starts once the limit phrase is seen (so with no
    // limit pretend it was seen already). Reverse order: extraction runs until
    // the phrase is seen (so pretend it was not).
    $limit_hit = $reverse ? false : !$has_limit;

    foreach ($group_list as $group)
    {
        $item_count = 0;
        $dir = c_query_dir($xpath,$group);
        if (!$limit_hit && $has_limit)
            $limit_hit = (strpos($dir,$extras[$limit_key])!==FALSE);
        $dir = c_deco_dir($dir,$group_count,$drop_deco);
        $group_count += $reverse ? -1 : +1;

        if ($has_limit)
        {
            if ($reverse)
            {
                if ($limit_hit)
                    break;
            }
            elseif (!$limit_hit)
                continue;
        }

        $bash_printer->mkdir_print($dir,$extras);
        $node_list = c_query_list($xpath,$group);
        foreach ($node_list as $node)
        {
            ++$item_count;
            c_query_row($xpath,$node,$drop_deco,$row,$title);

            // each entry might be in such forms: either "FileType" or
            // "FileType=FileExtension" (e.g. "PDF", "Slides=pdf")
            for ($i_ext = 0; $i_ext < count($extensions); ++$i_ext)
            {
                $ext_parts = explode('=',$extensions[$i_ext]);
                // An empty type (e.g. from a doubled space) has no first character to inspect.
                if ($ext_parts[0]==='')
                    continue;

                if ($split)
                    $target_dir = $extras[$split_dirs_key][$i_ext].'/'.$dir;
                else
                    $target_dir = $dir;

                $attr_extractor = 'href';
                if ($ext_parts[0][0]=='~') // extract link by extension from viewer frame
                {
                    $ext_parts[0] = substr($ext_parts[0],1);
                    $links = c_get_embedded_links($row,$ext_parts[0],$session);
                    if ($links===NULL)
                    {
                        file_put_contents('php://stderr', "Loading embedded frame failed: '$dir/$title'\n");
                        continue;
                    }
                    else if ($links->length===0 && $i_ext===0)
                    {
                        // Only the main (first) resource is reported as missing.
                        // A flash-player fallback used to be attempted here; it is disabled.
                        file_put_contents('php://stderr', "No resources '$ext_parts[0]' found for '$dir/$title'\n");
                        if ($debug)
                            file_put_contents('DEBUG_'.$title,$debug_page);
                        continue;
                    }
                    else
                        $attr_extractor = 'src';
                    if ($debug)
                        file_put_contents('php://stderr', "For $dir/$title ".$links->length." '$ext_parts[0]' links found.\n");
                }
                else if ($ext_parts[0][0]=='.') // extract link by extension of the linked file
                {
                    $links = $xpath->query('.//div[@class="course-lecture-item-resource"]/a[contains(@href,"'.$ext_parts[0].'")]',$node);
                    $ext_parts[0] = substr($ext_parts[0],1);
                }
                else // extract link by tooltip of the link
                    $links = $xpath->query('.//a[contains(@title,"'.$ext_parts[0].'")]',$node);

                // The last part is the extension to save under ("Slides=pdf" -> "pdf").
                $save_ext = strtolower($ext_parts[count($ext_parts)-1]);

                foreach ($links as $link_node)
                {
                    // Several files of one type are told apart by their tooltip.
                    $suffix = '';
                    if ($links->length>1)
                    {
                        $title_attr = $link_node->attributes->getNamedItem('title');
                        if ($title_attr!==NULL)
                            $suffix = '.'.fix_filename($title_attr->nodeValue);
                    }

                    $address_attr = $link_node->attributes->getNamedItem($attr_extractor);
                    if ($address_attr===NULL) // nothing to download from this node
                        continue;
                    $link = $address_attr->nodeValue;

                    if (!in_array($link,$downloads,true))
                    {
                        $target_filename = $target_dir.'/'.str_pad((string)$item_count,3,'0',STR_PAD_LEFT).'. '.$title.$suffix.'.'.$save_ext;
                        $bash_printer->wget_file_print($link,$target_filename,$downloads_filename,$session);
                    }
                    else if ($debug)
                        file_put_contents('php://stderr', "$dir/$title '$ext_parts[0]' already downloaded.\n");
                }
            }
        }
    }

    echo "\n";
    echo 'if [ $ERRORS -ne 0 ] ; then echo "There were some errors while downloading. Run the script again." ; fi'."\n";
    if (array_key_exists($beep_key,$extras))
        echo "beep\n";
}
// ---- entry point: php c_get.php "LECTURES_URL" "FILE_TYPES" "SESSION_CODE" [options] ----
if ($argc<4)
{
    file_put_contents('php://stderr', "Error: you should input minimum three arguments, the usage is:\n");
    // The *_key variables already contain the leading dashes.
    file_put_contents('php://stderr', "\"LECTURES_URL\" \"FILE_TYPES\" \"SESSION_CODE\" [$beep_key] [$reverse_key] [$drop_deco_key] [$debug_key] [$limit_key=\"section phrase\"] [$split_dirs_key=\"directories per file type\"]\n");
    exit(1);
}
else
{
    array_shift($argv); // drop the script name
    $url = array_shift($argv);
    // Ignore repeated spaces between file types instead of producing empty types.
    $extensions = preg_split('/ +/',(string)array_shift($argv),-1,PREG_SPLIT_NO_EMPTY);
    $session = array_shift($argv);

    if (count($extensions)===0)
    {
        file_put_contents('php://stderr', "Error: no file types given\n");
        exit(1);
    }

    $known_keys = array($split_dirs_key,$drop_deco_key,$reverse_key,$beep_key,$debug_key,$limit_key);
    $extras = array();
    foreach ($argv as $a)
    {
        // Split on the first "=" only, so values may contain "=" themselves.
        $parts = explode('=',$a,2);
        if (!in_array($parts[0],$known_keys,true))
        {
            file_put_contents('php://stderr', 'Unknown extra argument "'.$parts[0]."\"\n");
            exit(1);
        }
        $extras[$parts[0]] = count($parts)==1 ? NULL : $parts[1];
    }

    $xpath = get_page_xpath($url,$session);
    if ($xpath===NULL)
    {
        file_put_contents('php://stderr', "Loading lectures page failed: '$url'\n");
        exit(1);
    }
    print_wget($xpath,$session,$extensions,$extras);
}
?> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
/* ===================== COURSERA PREVIEW GETTER ======================= | |
tags: [coursera video download] [coursera lecture download] | |
CHANGELOG: | |
--------- | |
2013-07-15 * if there is nothing to download for given folder | |
it is not created | |
2013-06-29 * technical change to work with c_common | |
2013-05-15 * reporting failed download or extraction of main | |
resource | |
2013-05-03 * just keeping in sync with c_get | |
2013-04-22 * dropping a week/lecture phrase from filenames as well | |
2013-04-15 * update to follow last Coursera changes | |
2012-10-12 * sharing code with c_common.php; | |
some fixes to follow Coursera changes of preview sites | |
2012-07-05 * initial release | |
WHAT IT DOES: | |
------------ | |
* it is counterpart for Coursera getter, but this one works only for course previews | |
-- the ones with embedded video player, and nothing else | |
WHAT YOU NEED: | |
------------- | |
1. proper shell (Windows users -- of course I recommend switching to Linux entirely, but as a workaround Cygwin should be fine -- I don't know how about the tools I mention below) | |
2. wget (in openSUSE `sudo zypper in wget`) | |
3. php5 (in openSUSE `sudo zypper in php5`) | |
4. php5-mbstring (in openSUSE `sudo zypper in php5-mbstring`) | |
USAGE: | |
----- | |
php c_preview.php "link_to_preview_page" "video_file_type" > wget_script_name.sh | |
sh wget_script_name.sh | |
Example (this is one line): | |
php c_preview.php "https://class.coursera.org/crypto-preview/lecture/index" "mp4" > wgetter.sh
the one above creates appropriate script for wget for downloading videos (MP4). Now execute | |
sh wgetter.sh | |
Please note the file type is not guaranteed to exist on the server
(so far "webm" and "mp4" are supported by Coursera). | |
================================================================== */ | |
require_once 'c_common.php'; | |
// https://class.coursera.org/machlearning-001/lecture/preview/index | |
/**
 * Walks a course preview page and prints the complete wget shell script to stdout.
 *
 * For every lecture the embedded player page is fetched and the first
 * <source> of the requested video type is scheduled for download. Problems
 * are reported on stderr.
 *
 * @param DOMXPath $xpath  evaluator bound to the preview lectures page
 * @param string   $ext    video type to download (e.g. "mp4", "webm")
 * @param array    $extras map of option switch => raw value
 */
function print_wget($xpath,$ext,$extras)
{
    global $split_dirs_key,$drop_deco_key,$reverse_key,$beep_key;

    $bash_printer = new BashPrinter();
    process_extra_arguments($extras);

    $reverse = array_key_exists($reverse_key,$extras);
    $drop_deco = array_key_exists($drop_deco_key,$extras);
    // With split directories the section directory is created below each of
    // them, so the single video type goes below the first one.
    $parent_dir = '';
    if (array_key_exists($split_dirs_key,$extras) && isset($extras[$split_dirs_key][0]))
        $parent_dir = $extras[$split_dirs_key][0].'/';

    echo "ERRORS=0\n";
    $group_list = c_query_groups($xpath);
    $group_count = $reverse ? $group_list->length : 1;

    foreach ($group_list as $group)
    {
        $item_count = 0;
        $dir = c_deco_dir(c_query_dir($xpath,$group),$group_count,$drop_deco);
        $group_count += $reverse ? -1 : +1;
        $bash_printer->mkdir_print($dir,$extras);

        // get the list of all lectures within current group (week)
        $node_list = c_query_list($xpath,$group);
        foreach ($node_list as $node)
        {
            ++$item_count;
            c_query_row($xpath,$node,$drop_deco,$row,$title);

            $video_list = c_get_embedded_links($row,$ext);
            if ($video_list===NULL)
            {
                file_put_contents('php://stderr', "Loading embedded frame failed: '$dir/$title'\n");
                continue;
            }
            else if ($video_list->length==0)
            {
                file_put_contents('php://stderr', "Filetype '$ext' not found for '".$title."'\n");
                continue;
            }

            $src_attr = $video_list->item(0)->attributes->getNamedItem('src');
            if ($src_attr===NULL) // a <source> without address cannot be downloaded
            {
                file_put_contents('php://stderr', "Filetype '$ext' not found for '".$title."'\n");
                continue;
            }

            $target_filename = $parent_dir.$dir.'/'.str_pad((string)$item_count,3,'0',STR_PAD_LEFT).'. '.$title.'.'.strtolower($ext);
            $bash_printer->wget_file_print($src_attr->nodeValue,$target_filename);
        }
    }

    echo 'if [ $ERRORS -ne 0 ] ; then echo "There were some errors while downloading. Run the script again." ; fi'."\n";
    if (array_key_exists($beep_key,$extras))
        echo "beep\n";
}
// ---- entry point: php c_preview.php "LECTURES_URL" "FILE_TYPE" [options] ----
if ($argc<3)
{
    file_put_contents('php://stderr', "Error: you should input minimum two arguments, the usage is:\n");
    // The *_key variables already contain the leading dashes.
    file_put_contents('php://stderr', "\"LECTURES_URL\" \"FILE_TYPE\" [$beep_key] [$reverse_key] [$drop_deco_key] [$split_dirs_key=\"directory for the videos\"]\n");
    exit(1);
}
else
{
    array_shift($argv); // drop the script name
    $url = array_shift($argv);
    $extensions = array_shift($argv); // a single video type, e.g. "mp4"

    $known_keys = array($split_dirs_key,$drop_deco_key,$reverse_key,$beep_key);
    $extras = array();
    foreach ($argv as $a)
    {
        // Split on the first "=" only, so values may contain "=" themselves.
        $parts = explode('=',$a,2);
        if (!in_array($parts[0],$known_keys,true))
        {
            file_put_contents('php://stderr', 'Unknown extra argument "'.$parts[0]."\"\n");
            exit(1);
        }
        $extras[$parts[0]] = count($parts)==1 ? NULL : $parts[1];
    }

    $xpath = get_page_xpath($url);
    if ($xpath===NULL)
    {
        file_put_contents('php://stderr', "Loading lectures page failed: '$url'\n");
        exit(1);
    }
    print_wget($xpath,$extensions,$extras);
}
?> |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment