@alonsoir
Last active February 28, 2025 12:50
A tool for scraping files from a URL. Multi-threaded, multi-process, cross-platform (Linux/OSX). Work in progress: it is not yet able to descend and walk the entire DOM tree to find links.

time gcc -o scraper scrapper_multi_thread.c -lcurl -pthread -Wall -Wextra -O2
gcc -o scraper scrapper_multi_thread.c -lcurl -pthread -Wall -Wextra -O2  0.17s user 0.08s system 104% cpu 0.239 total

time ./scraper https://www.omg.org 3 4
[MAIN] Hilo 0 creado
[MAIN] Hilo 1 creado
[HILO 123145548595200] Iniciado
[HILO 123145549131776] Iniciado
[HILO 123145548595200] Esperando URLs...
[HILO 123145549668352] Iniciado
[HILO 123145549131776] Esperando URLs...
[HILO 123145549668352] Esperando URLs...
[MAIN] Hilo 2 creado
[MAIN] Hilo 3 creado
[COLA] Encolada https://www.omg.org (profundidad 0, total 1)
[COLA] Enviando broadcast a los hilos
[HILO 123145550204928] Iniciado
[COLA] Desencolada https://www.omg.org (restan 0)
[HILO 123145550204928] Procesando https://www.omg.org (profundidad 0)
[HILO 123145548595200] Esperando URLs...
[HILO 123145549668352] Esperando URLs...
[HILO 123145549131776] Esperando URLs...
[DESCARGA] HTML de https://www.omg.org descargado en 0.117 s
[MEMORIA] Uso actual: 3710976 KB
[HILO 123145550204928] Esperando URLs...
[MAIN] Finalizando procesamiento
[HILO 123145548595200] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145550204928] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145549131776] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145549668352] Terminando (sin URLs y procesamiento finalizado)
[MAIN] Hilo 0 terminado
[MAIN] Hilo 1 terminado
[MAIN] Hilo 2 terminado
[MAIN] Hilo 3 terminado
[INFO] Proceso completado en 60.010 s. URLs procesadas: 0
[MEMORIA] Uso actual: 3710976 KB
./scraper https://www.omg.org 3 4  0.01s user 0.01s system 0% cpu 1:00.32 total

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <regex.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <stdbool.h>
#ifdef __linux__
#include <sys/sysinfo.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#define RED "\x1b[31m"
#define GREEN "\x1b[32m"
#define YELLOW "\x1b[33m"
#define BLUE "\x1b[34m"
#define RESET "\x1b[0m"
#define FILE_PATTERN "\\.(pdf|txt|md|doc|rtf)$" /* matched against the extracted URL, not the raw HTML */
#define URL_PATTERN "href=\"([^\"]+)\""
#define MAX_DEPTH 10
#define MAX_URLS 1000
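/* Shared state: a singly linked FIFO of pending URLs guarded by queue_mutex.
   Worker threads block on queue_cv until a URL is enqueued or processing ends. */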
pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t queue_cv = PTHREAD_COND_INITIALIZER;
struct URLNode {
    char *url;
    int depth;
    struct URLNode *next;
};
struct URLQueue {
    struct URLNode *front, *rear;
    int count;
} url_queue = {NULL, NULL, 0};
int processed_urls = 0;
bool processing = true;
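/* Returns wall-clock time in seconds with microsecond resolution (gettimeofday). */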
double get_time() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}
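/* Reports the peak resident set size (ru_maxrss). Linux reports this value in
   kilobytes, macOS in bytes, hence the conversion below. */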
void print_memory_usage() {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
#ifdef __APPLE__
    long rss_kb = usage.ru_maxrss / 1024; /* macOS reports bytes */
#else
    long rss_kb = usage.ru_maxrss;        /* Linux reports kilobytes */
#endif
    printf(YELLOW "[MEMORIA] Uso actual: %ld KB\n" RESET, rss_kb);
}
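/* Detects the number of available CPU cores: get_nprocs() on Linux,
   sysctl hw.ncpu on macOS, sysconf(_SC_NPROCESSORS_ONLN) elsewhere. */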
int get_available_threads() {
#ifdef __linux__
    return get_nprocs();
#elif defined(__APPLE__)
    int numCPU;
    size_t size = sizeof(numCPU);
    sysctlbyname("hw.ncpu", &numCPU, &size, NULL, 0);
    return numCPU;
#else
    return sysconf(_SC_NPROCESSORS_ONLN);
#endif
}
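/* Appends a URL to the work queue (up to MAX_URLS) and wakes all waiting workers. */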
void enqueue(const char *url, int depth) {
    struct URLNode *new_node = malloc(sizeof(struct URLNode));
    if (!new_node) {
        fprintf(stderr, RED "[ERROR] No se pudo asignar memoria para nodo\n" RESET);
        return;
    }
    new_node->url = strdup(url);
    if (!new_node->url) {
        free(new_node);
        fprintf(stderr, RED "[ERROR] No se pudo duplicar URL\n" RESET);
        return;
    }
    new_node->depth = depth;
    new_node->next = NULL;
    pthread_mutex_lock(&queue_mutex);
    if (url_queue.count >= MAX_URLS) {
        printf(YELLOW "[WARN] Límite de URLs alcanzado (%d)\n" RESET, MAX_URLS);
        free(new_node->url);
        free(new_node);
    } else {
        if (!url_queue.rear) {
            url_queue.front = url_queue.rear = new_node;
        } else {
            url_queue.rear->next = new_node;
            url_queue.rear = new_node;
        }
        url_queue.count++;
        printf(GREEN "[COLA] Encolada %s (profundidad %d, total %d)\n" RESET, url, depth, url_queue.count);
    }
    printf(YELLOW "[COLA] Enviando broadcast a los hilos\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);
}
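/* Removes and returns the oldest queued URL. The caller must hold queue_mutex. */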
struct URLNode *dequeue() {
    if (!url_queue.front) {
        printf(YELLOW "[COLA] Cola vacía\n" RESET);
        return NULL;
    }
    struct URLNode *node = url_queue.front;
    url_queue.front = node->next;
    if (!url_queue.front) url_queue.rear = NULL;
    url_queue.count--;
    printf(BLUE "[COLA] Desencolada %s (restan %d)\n" RESET, node->url, url_queue.count);
    return node;
}
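/* libcurl write callback: appends each received chunk to a growing,
   NUL-terminated heap buffer owned by the caller. */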
size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, char **userp) {
    size_t realSize = size * nmemb;
    size_t oldSize = strlen(*userp); /* buffer is always NUL-terminated */
    char *tmp = realloc(*userp, oldSize + realSize + 1);
    if (!tmp) {
        fprintf(stderr, RED "[ERROR] Fallo en realloc\n" RESET);
        return 0; /* returning short aborts the transfer */
    }
    *userp = tmp;
    memcpy(*userp + oldSize, contents, realSize); /* append instead of overwriting earlier chunks */
    (*userp)[oldSize + realSize] = '\0';
    return realSize;
}
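/* Downloads the HTML of a page into a heap buffer that the caller must free.
   Follows redirects; TLS certificate verification is disabled. */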
char *fetch_html(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) {
        printf(RED "[ERROR] Fallo al inicializar CURL\n" RESET);
        return NULL;
    }
    char *response = malloc(1);
    if (!response) {
        curl_easy_cleanup(curl);
        return NULL;
    }
    response[0] = '\0';
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");
    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    if (res == CURLE_OK) {
        printf(GREEN "[DESCARGA] HTML de %s descargado en %.3f s\n" RESET, url, end - start);
    } else {
        printf(RED "[ERROR] Fallo al descargar %s: %s\n" RESET, url, curl_easy_strerror(res));
        free(response);
        response = NULL;
    }
    curl_easy_cleanup(curl);
    return response;
}
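/* Downloads a matched file URL into the working directory as
   downloaded_<processed_urls>_<basename>, using libcurl's default write-to-FILE. */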
void download_file(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) return;
    char filename[256];
    snprintf(filename, sizeof(filename), "downloaded_%d_%s", processed_urls, strrchr(url, '/') ? strrchr(url, '/') + 1 : "file");
    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        printf(RED "[ERROR] No se pudo abrir %s para escritura\n" RESET, filename);
        curl_easy_cleanup(curl);
        return;
    }
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");
    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    fclose(fp);
    if (res == CURLE_OK) {
        printf(GREEN "[ARCHIVO] Descargado %s como %s en %.3f s\n" RESET, url, filename, end - start);
    } else {
        printf(RED "[ERROR] Fallo al descargar archivo %s: %s\n" RESET, url, curl_easy_strerror(res));
        remove(filename);
    }
    curl_easy_cleanup(curl);
}
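/* Scans the HTML for href="..." links: URLs matching FILE_PATTERN (pdf/txt/md/doc/rtf)
   are downloaded immediately, everything else is re-enqueued one level deeper. */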
void extract_links(char *html, int depth) {
    regex_t url_regex, file_regex;
    if (regcomp(&url_regex, URL_PATTERN, REG_EXTENDED)) {
        printf(RED "[ERROR] Fallo al compilar regex\n" RESET);
        return;
    }
    if (regcomp(&file_regex, FILE_PATTERN, REG_EXTENDED)) {
        printf(RED "[ERROR] Fallo al compilar regex\n" RESET);
        regfree(&url_regex);
        return;
    }
    char *cursor = html;
    regmatch_t matches[2];
    while (!regexec(&url_regex, cursor, 2, matches, 0)) {
        int len = matches[1].rm_eo - matches[1].rm_so;
        char *url = strndup(cursor + matches[1].rm_so, len);
        regoff_t advance = matches[0].rm_eo; /* save the advance before matches could be reused */
        if (url && depth + 1 <= MAX_DEPTH) {
            if (!regexec(&file_regex, url, 0, NULL, 0)) {
                download_file(url);
                pthread_mutex_lock(&queue_mutex); /* avoid a data race on the counter */
                processed_urls++;
                pthread_mutex_unlock(&queue_mutex);
            } else {
                enqueue(url, depth + 1);
            }
        }
        free(url);
        cursor += advance; /* skip past the full href match */
    }
    regfree(&url_regex);
    regfree(&file_regex);
}
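/* Worker loop: wait for URLs, fetch the page, extract links, and exit once the
   queue is empty and main has cleared the processing flag. */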
void *worker_thread(void *unused) {
    (void)unused;
    printf(BLUE "[HILO %lu] Iniciado\n" RESET, (unsigned long)pthread_self());
    while (true) {
        struct URLNode *task = NULL;
        pthread_mutex_lock(&queue_mutex);
        while (!url_queue.front && processing) {
            printf(YELLOW "[HILO %lu] Esperando URLs...\n" RESET, (unsigned long)pthread_self());
            pthread_cond_wait(&queue_cv, &queue_mutex);
        }
        if (url_queue.front) {
            task = dequeue();
        } else if (!processing) {
            printf(BLUE "[HILO %lu] Terminando (sin URLs y procesamiento finalizado)\n" RESET, (unsigned long)pthread_self());
            pthread_mutex_unlock(&queue_mutex);
            break;
        }
        pthread_mutex_unlock(&queue_mutex);
        if (task) {
            printf(BLUE "[HILO %lu] Procesando %s (profundidad %d)\n" RESET, (unsigned long)pthread_self(), task->url, task->depth);
            char *html = fetch_html(task->url);
            if (html) {
                extract_links(html, task->depth);
                free(html);
            }
            free(task->url);
            free(task);
            print_memory_usage();
        }
    }
    return NULL;
}
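/* Usage: ./scraper <URL> <depth|auto> <threads|auto>
   Spawns the worker pool, seeds the queue with the start URL, lets the crawl run
   for a fixed 60-second window, then signals shutdown and joins the workers. */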
int main(int argc, char *argv[]) {
    if (argc != 4) {
        printf(RED "Uso: %s [URL] [PROFUNDIDAD] [HILOS]\n" RESET, argv[0]);
        return 1;
    }
    double start_time = get_time();
    curl_global_init(CURL_GLOBAL_ALL);
    int depth = (strcmp(argv[2], "auto") == 0) ? 5 : atoi(argv[2]);
    if (depth < 0 || depth > MAX_DEPTH) depth = MAX_DEPTH;
    /* NOTE: the CLI depth is only validated here; extract_links still limits the crawl with MAX_DEPTH (tool is work in progress). */
    int num_threads = (strcmp(argv[3], "auto") == 0) ? get_available_threads() : atoi(argv[3]);
    if (num_threads <= 0) num_threads = 1; /* guard against atoi failures and a zero-sized thread array */
    pthread_t threads[num_threads];
    for (int i = 0; i < num_threads; i++) {
        if (pthread_create(&threads[i], NULL, worker_thread, NULL) != 0) {
            printf(RED "[ERROR] Fallo al crear hilo %d\n" RESET, i);
        } else {
            printf(GREEN "[MAIN] Hilo %d creado\n" RESET, i);
        }
    }
    enqueue(argv[1], 0); /* seed the queue and signal the workers */
    sleep(60); /* fixed crawl window; this is why the sample run takes ~60 s */
    pthread_mutex_lock(&queue_mutex);
    processing = false;
    printf(YELLOW "[MAIN] Finalizando procesamiento\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);
    for (int i = 0; i < num_threads; i++) {
        pthread_join(threads[i], NULL);
        printf(GREEN "[MAIN] Hilo %d terminado\n" RESET, i);
    }
    double end_time = get_time();
    printf(GREEN "[INFO] Proceso completado en %.3f s. URLs procesadas: %d\n" RESET, end_time - start_time, processed_urls);
    print_memory_usage();
    curl_global_cleanup();
    return 0;
}