Last active
January 2, 2016 21:39
-
-
Save mekler/8364712 to your computer and use it in GitHub Desktop.
Scrapeo de Snie de la SEP (Primera parte)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php | |
// Defining the basic cURL function | |
/**
 * Sends a request to $url with a fixed browser-like header set and returns the body.
 *
 * The POST body is assembled from two sources:
 *   1. Hidden fields scraped from $srcPage (a previously fetched page). When the page
 *      parses as XML, only __EVENTVALIDATION and __VIEWSTATE are taken; otherwise every
 *      <input name="__..."> found by regex is taken, skipping names that the caller
 *      already supplies in $postArray.
 *   2. The key/value pairs in $postArray, appended verbatim. They are NOT URL-encoded
 *      here, so callers must pass values that are already encoded.
 *
 * Cookies are read from and written to 'cookie.txt' in the current working directory.
 *
 * @param string      $url       Address to request.
 * @param array|null  $postArray Extra form fields (pre-encoded values); NULL for none.
 * @param string|null $srcPage   Markup of a previous response to harvest hidden fields from;
 *                               NULL to skip harvesting.
 * @return string|false Response body, or false when the handle cannot be created or the
 *                      transfer fails.
 */
function curl($url, $postArray=NULL, $srcPage=NULL) {
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // return the body instead of printing it
    curl_setopt($ch, CURLOPT_ENCODING, 'identity');
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
    curl_setopt($ch, CURLOPT_COOKIEFILE, 'cookie.txt');
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding: gzip, deflate',
        'Accept-Language: en-US,en;q=0.5',
        'Cache-Control: no-cache',
        'Connection: keep-alive',
        'Content-Type: application/x-www-form-urlencoded; charset=utf-8',
        'Host: www.snie.sep.gob.mx',
        'Pragma: no-cache',
        'Referer: http://www.snie.sep.gob.mx/SNIESC/',
        'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0',
        'X-MicrosoftAjax: Delta=true',
        'X-Requested-With: XMLHttpRequest'
        )
    );

    // Each entry is one already-encoded "name=value" pair; joined with '&' at the end.
    $pairs = array();

    // count() on NULL or a string throws a TypeError on PHP 8, so test for NULL explicitly.
    // Any non-NULL value (including an empty string or a failed previous fetch) is still
    // scanned, which keeps the "NOT MATCHING" diagnostic of the older count()-based check.
    if ($srcPage !== NULL) {
        $srcPage = (string) $srcPage;
        $tabla = simplexml_load_string($srcPage);
        if ($tabla) {
            $eventvalidation = $tabla->xpath('//*[@id="__EVENTVALIDATION"]');
            $viewstate = $tabla->xpath('//*[@id="__VIEWSTATE"]');
            // A missing element yields an empty value rather than an undefined-offset error.
            $val_eventValidation = (is_array($eventvalidation) && isset($eventvalidation[0]['value']))
                ? (string) $eventvalidation[0]['value']
                : '';
            $val_viewstate = (is_array($viewstate) && isset($viewstate[0]['value']))
                ? (string) $viewstate[0]['value']
                : '';
            $pairs[] = '__EVENTVALIDATION=' . urlencode($val_eventValidation);
            $pairs[] = '__VIEWSTATE=' . urlencode($val_viewstate);
        } else {
            // Not well-formed XML: fall back to scraping the hidden "__*" inputs by regex.
            $pattern = '/<input[^>]+name="(__\w+)"[^>]+value="([^"]*)"[^>]+>/';
            if (preg_match_all($pattern, $srcPage, $matches)) {
                foreach ($matches[1] as $idx => $fieldName) {
                    // A value supplied by the caller takes precedence over the scraped one.
                    if (!isset($postArray[$fieldName])) {
                        $pairs[] = $fieldName . "=" . urlencode($matches[2][$idx]);
                    }
                }
            } else {
                echo "NOT MATCHING";
            }
        }
    }

    if (is_array($postArray) && count($postArray) > 0) {
        foreach ($postArray as $key => $value) {
            // Appended as-is: the caller is responsible for URL-encoding.
            $pairs[] = $key . "=" . $value;
        }
        curl_setopt($ch, CURLOPT_POST, 1);
    }

    // The body is always set, even when empty, exactly as before.
    curl_setopt($ch, CURLOPT_POSTFIELDS, implode("&", $pairs));

    $data = curl_exec($ch);
    curl_close($ch);
    return $data;
}
#================================================ IMPLEMENTATION =========================================================
# Usage: php <this script> <entidad>
# For every school code (cct) of the given state, queries the SNIE search form and writes
# each href found in the response to <entidad>.csv as: cct, href, capture-group index, match index.
# Schools with no href get a single "cct, N/A, -1" row.

# The argument ends up in both a file path and an SQL query, so accept digits only.
if (!isset($argv[1]) || !preg_match('/^[0-9]+$/', $argv[1])) {
    fwrite(STDERR, "Usage: php " . (isset($argv[0]) ? $argv[0] : 'script.php') . " <entidad (numeric)>\n");
    exit(1);
}
$estado = $argv[1];

# Change the database settings
$link = mysqli_connect('localhost','user','contrasena','base_de_datos');
if (!$link) {
    fwrite(STDERR, "Database connection failed: " . mysqli_connect_error() . "\n");
    exit(1);
}

// Bound as an integer so the comparison stays numeric, like the unquoted literal it replaces.
$stmt = mysqli_prepare($link, 'select cct from escuelas where entidad = ?');
if (!$stmt) {
    fwrite(STDERR, "Query preparation failed: " . mysqli_error($link) . "\n");
    mysqli_close($link);
    exit(1);
}
$entidad = (int) $estado;
mysqli_stmt_bind_param($stmt, 'i', $entidad);
if (!mysqli_stmt_execute($stmt)) {
    fwrite(STDERR, "Query failed: " . mysqli_stmt_error($stmt) . "\n");
    mysqli_stmt_close($stmt);
    mysqli_close($link);
    exit(1);
}
// Buffer all rows up front: the loop below spends a long time on HTTP requests.
mysqli_stmt_store_result($stmt);
mysqli_stmt_bind_result($stmt, $cct);

// Opened only after the query succeeded, so a failed run does not truncate earlier output.
$fp = fopen('/mnt/cbsvolume1/scrape-snie/salida/'.$estado.'.csv', 'w');
if ($fp === false) {
    fwrite(STDERR, "Cannot open output file for entidad " . $estado . "\n");
    mysqli_stmt_close($stmt);
    mysqli_close($link);
    exit(1);
}

$url = "http://www.snie.sep.gob.mx/SNIESC/";
while (mysqli_stmt_fetch($stmt)) {
    // Fetch the form first; its hidden fields are fed into the search request below.
    $data = curl($url);
    $opt=array("Button4"=>"BUSCAR",
        "DropDownList1"=>"00",
        "DropDownList4"=>"00",
        "DropDownList7"=>"0",
        "DropDownList8"=>"0",
        "DropDownList9"=>"00",
        "RadioButtonList1"=>"2",
        "ScriptManager1"=>urlencode("UpdatePanel4|Timer1"),
        "TextBox2"=>"",
        "__ASYNCPOST"=>"true",
        "TextBox3"=>$cct);
    $data = curl($url, $opt, $data);

    $pattern = '/href="([^"]*)"/';
    preg_match_all($pattern, $data, $matches);
    $n = count( $matches );
    var_dump($n,$matches);

    // Index 0 holds the full matches; capture groups start at 1.
    $flag = false;
    for($i=1;$i<$n;$i++){
        $k = count( $matches[$i] );
        if( $k>0 )
            $flag = true;
        for($j=0;$j<$k;$j++){
            fputcsv($fp, array($cct, $matches[$i][$j], $i,$j) );
        }
    }
    if(!$flag)
        fputcsv($fp, array($cct, 'N/A','-1') );
}

mysqli_stmt_close($stmt);
mysqli_close($link);
fclose($fp);
?> |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Runs the scraper once per state number, 1 through 32, one after another.
for ((estado = 1; estado <= 32; estado++)); do
    php scrape-snie/primer_estado.php "$estado"
done
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment