Skip to content

Instantly share code, notes, and snippets.

@mekler
Last active January 2, 2016 21:39
Show Gist options
  • Save mekler/8364712 to your computer and use it in GitHub Desktop.
Save mekler/8364712 to your computer and use it in GitHub Desktop.
Scrapeo de Snie de la SEP (Primera parte)
<?php
/**
 * Sends a request to $url through cURL and returns the response body.
 *
 * The request carries a fixed set of browser-like headers and shares cookies
 * between calls through the file 'cookie.txt' in the current working directory.
 *
 * When $srcPage is given, hidden form state is copied from it into the POST
 * body so that the next request continues the same form session:
 *   - if $srcPage parses as XML, the `value` attributes of the elements with
 *     id "__EVENTVALIDATION" and "__VIEWSTATE" are sent;
 *   - otherwise every `<input name="__..." value="...">` found by regex is
 *     sent, except names that $postArray already provides.
 *
 * @param string            $url       Address to request.
 * @param array|null        $postArray Extra POST fields (name => value). Values are
 *                                     appended as-is, so callers must URL-encode them.
 * @param string|false|null $srcPage   Body of a previously fetched page, or NULL to
 *                                     skip hidden-field extraction.
 *
 * @return string|false Response body, or false when the handle cannot be created
 *                      or the transfer fails.
 */
function curl($url, $postArray=NULL, $srcPage=NULL) {
    $ch = curl_init($url); // Initialising cURL
    if ($ch === false) {
        return false;
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); // Return the body instead of printing it
    curl_setopt($ch, CURLOPT_ENCODING, 'identity');
    curl_setopt($ch, CURLOPT_COOKIEJAR, 'cookie.txt');
    curl_setopt($ch, CURLOPT_COOKIEFILE, 'cookie.txt');
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding: gzip, deflate',
        'Accept-Language: en-US,en;q=0.5',
        'Cache-Control: no-cache',
        'Connection: keep-alive',
        'Content-Type: application/x-www-form-urlencoded; charset=utf-8',
        'Host: www.snie.sep.gob.mx',
        'Pragma: no-cache',
        'Referer: http://www.snie.sep.gob.mx/SNIESC/',
        'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:25.0) Gecko/20100101 Firefox/25.0',
        'X-MicrosoftAjax: Delta=true',
        'X-Requested-With: XMLHttpRequest'
    ));

    // "name=value" pieces of the POST body, in the order they must be sent.
    $parts = array();

    // Explicit NULL test: count() on a string/NULL warns on PHP 7.2+ and throws on PHP 8.
    if ($srcPage !== NULL) {
        $page = (string)$srcPage;

        $tabla = false;
        if ($page !== '') {
            // The source is normally HTML, which rarely parses as XML; collect the
            // parser errors internally instead of emitting one warning per error.
            $previous = libxml_use_internal_errors(true);
            $tabla = simplexml_load_string($page);
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        if ($tabla) {
            $eventvalidation = $tabla->xpath('//*[@id="__EVENTVALIDATION"]');
            $viewstate = $tabla->xpath('//*[@id="__VIEWSTATE"]');
            // A missing element is sent as an empty value rather than raising an error.
            $val_eventValidation = isset($eventvalidation[0]['value']) ? (string)$eventvalidation[0]['value'] : '';
            $val_viewstate = isset($viewstate[0]['value']) ? (string)$viewstate[0]['value'] : '';
            $parts[] = '__EVENTVALIDATION=' . urlencode($val_eventValidation);
            $parts[] = '__VIEWSTATE=' . urlencode($val_viewstate);
        } else {
            $pattern = '/<input[^>]+name="(__\w+)"[^>]+value="([^"]*)"[^>]+>/';
            if (preg_match_all($pattern, $page, $matches, PREG_SET_ORDER)) {
                foreach ($matches as $match) {
                    // Fields supplied by the caller take precedence over the page's copy.
                    if (!isset($postArray[$match[1]])) {
                        $parts[] = $match[1] . "=" . urlencode($match[2]);
                    }
                }
            } else {
                echo "NOT MATCHING";
            }
        }
    }

    if (is_array($postArray) && count($postArray) > 0) {
        foreach ($postArray as $key => $value) {
            // Appended raw on purpose: existing callers pre-encode their values.
            $parts[] = $key . "=" . $value;
        }
        curl_setopt($ch, CURLOPT_POST, 1);
    }

    // NOTE(review): per the libcurl documentation, setting POSTFIELDS implies a POST
    // even when the body is empty, so the initial page fetch is also sent as a POST.
    // Kept as in the original; confirm whether a plain GET was intended.
    curl_setopt($ch, CURLOPT_POSTFIELDS, implode('&', $parts));

    $data = curl_exec($ch); // Executing the cURL request
    curl_close($ch); // Closing cURL
    return $data;
}
#================================================ IMPLEMENTATION =========================================================
// Usage: php <this script> <estado>
// For every school code (cct) of the given state found in the `escuelas` table,
// fetches the search page, submits the search form for that code and writes every
// href found in the response to <estado>.csv as: cct, href, 1, index-of-href.
// Codes whose response contains no href get a single row: cct, N/A, -1.

// The state number ends up in a file path and in a SQL query, so accept digits only.
if (!isset($argv[1]) || !preg_match('/^[0-9]+$/D', (string)$argv[1])) {
    fwrite(STDERR, "Usage: php " . (isset($argv[0]) ? $argv[0] : 'script.php') . " <estado (numeric)>\n");
    exit(1);
}
$estado = $argv[1];

# Change the database settings below
$link = mysqli_connect('localhost','user','contrasena','base_de_datos');
if (!$link) {
    fwrite(STDERR, "Database connection failed: " . mysqli_connect_error() . "\n");
    exit(1);
}

// Read the whole list of school codes up front with a bound parameter, so the
// connection is not held open (and possibly timed out) during the HTTP requests.
$stmt = mysqli_prepare($link, 'select cct from escuelas where entidad=?');
$entidad = (int)$estado;
if (!$stmt
    || !mysqli_stmt_bind_param($stmt, 'i', $entidad)
    || !mysqli_stmt_execute($stmt)
    || !mysqli_stmt_store_result($stmt)
    || !mysqli_stmt_bind_result($stmt, $boundCct)
) {
    fwrite(STDERR, "Query failed: " . mysqli_error($link) . "\n");
    mysqli_close($link);
    exit(1);
}
$ccts = array();
while (mysqli_stmt_fetch($stmt)) {
    $ccts[] = $boundCct;
}
mysqli_stmt_close($stmt);
mysqli_close($link);

// Open (and truncate) the output only once the input list is known to be readable.
$outputPath = '/mnt/cbsvolume1/scrape-snie/salida/'.$estado.'.csv';
$fp = fopen($outputPath, 'w');
if ($fp === false) {
    fwrite(STDERR, "Cannot open output file: " . $outputPath . "\n");
    exit(1);
}

$url = "http://www.snie.sep.gob.mx/SNIESC/";
$pattern = '/href="([^"]*)"/';

foreach ($ccts as $cct) {
    // First request: obtain the page whose hidden form fields the search must echo back.
    $data = curl($url);
    if ($data === false) {
        fwrite(STDERR, "Initial page fetch failed for cct " . $cct . "\n");
    }

    // curl() appends these values unencoded, so they are URL-encoded here.
    $opt=array("Button4"=>"BUSCAR",
        "DropDownList1"=>"00",
        "DropDownList4"=>"00",
        "DropDownList7"=>"0",
        "DropDownList8"=>"0",
        "DropDownList9"=>"00",
        "RadioButtonList1"=>"2",
        "ScriptManager1"=>urlencode("UpdatePanel4|Timer1"),
        "TextBox2"=>"",
        "__ASYNCPOST"=>"true",
        "TextBox3"=>urlencode($cct));

    // Second request: submit the search for this school code.
    $data = curl($url, $opt, $data);

    // The pattern has a single capture group, so every link lives in $matches[1];
    // the constant 1 in the row is that group index, kept for output compatibility.
    $found = preg_match_all($pattern, (string)$data, $matches);
    if ($found) {
        foreach ($matches[1] as $j => $href) {
            fputcsv($fp, array($cct, $href, 1, $j) );
        }
    } else {
        fputcsv($fp, array($cct, 'N/A','-1') );
    }
}

fclose($fp);
?>
#!/bin/bash
# Runs the first-stage scraper once for each number from 1 to 32, passing the
# number as the script's only argument.
for ((i = 1; i <= 32; i++)); do
    php scrape-snie/primer_estado.php "$i"
done
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment