@alonsoir
Last active February 28, 2025 12:50
A tool for scraping files from a URL. Multi-threaded, multi-process, cross-platform (Linux/OSX). Work in progress: it is not yet able to descend and walk the entire DOM tree to find links.

time gcc -o scraper scrapper_multi_thread.c -lcurl -pthread -Wall -Wextra -O2
gcc -o scraper scrapper_multi_thread.c -lcurl -pthread -Wall -Wextra -O2  0.17s user 0.08s system 104% cpu 0.239 total

time ./scraper https://www.omg.org 3 4
[MAIN] Hilo 0 creado
[MAIN] Hilo 1 creado
[HILO 123145548595200] Iniciado
[HILO 123145549131776] Iniciado
[HILO 123145548595200] Esperando URLs...
[HILO 123145549668352] Iniciado
[HILO 123145549131776] Esperando URLs...
[HILO 123145549668352] Esperando URLs...
[MAIN] Hilo 2 creado
[MAIN] Hilo 3 creado
[COLA] Encolada https://www.omg.org (profundidad 0, total 1)
[COLA] Enviando broadcast a los hilos
[HILO 123145550204928] Iniciado
[COLA] Desencolada https://www.omg.org (restan 0)
[HILO 123145550204928] Procesando https://www.omg.org (profundidad 0)
[HILO 123145548595200] Esperando URLs...
[HILO 123145549668352] Esperando URLs...
[HILO 123145549131776] Esperando URLs...
[DESCARGA] HTML de https://www.omg.org descargado en 0.117 s
[MEMORIA] Uso actual: 3710976 KB
[HILO 123145550204928] Esperando URLs...
[MAIN] Finalizando procesamiento
[HILO 123145548595200] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145550204928] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145549131776] Terminando (sin URLs y procesamiento finalizado)
[HILO 123145549668352] Terminando (sin URLs y procesamiento finalizado)
[MAIN] Hilo 0 terminado
[MAIN] Hilo 1 terminado
[MAIN] Hilo 2 terminado
[MAIN] Hilo 3 terminado
[INFO] Proceso completado en 60.010 s. URLs procesadas: 0
[MEMORIA] Uso actual: 3710976 KB
./scraper https://www.omg.org 3 4  0.01s user 0.01s system 0% cpu 1:00.32 total

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <regex.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <stdbool.h>
#ifdef __linux__
#include <sys/sysinfo.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif
#define RED "\x1b[31m"
#define GREEN "\x1b[32m"
#define YELLOW "\x1b[33m"
#define BLUE "\x1b[34m"
#define RESET "\x1b[0m"
#define FILE_PATTERN "\\.(pdf|txt|md|doc|rtf)$" /* matched against the extracted URL, not the raw HTML */
#define URL_PATTERN "href=\"([^\"]+)\""
#define MAX_DEPTH 10
#define MAX_URLS 1000
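/* Shared state: a singly linked FIFO of pending URLs guarded by queue_mutex.
   Worker threads block on queue_cv until a URL is enqueued or processing ends. */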
pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t queue_cv = PTHREAD_COND_INITIALIZER;
struct URLNode {
    char *url;
    int depth;
    struct URLNode *next;
};
struct URLQueue {
    struct URLNode *front, *rear;
    int count;
} url_queue = {NULL, NULL, 0};
int processed_urls = 0;
bool processing = true;
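/* Returns wall-clock time in seconds with microsecond resolution (gettimeofday). */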
double get_time() {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}
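/* Reports the peak resident set size (ru_maxrss). Linux reports this value in
   kilobytes, macOS in bytes, hence the conversion below. */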
void print_memory_usage() {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
#ifdef __APPLE__
    long rss_kb = usage.ru_maxrss / 1024; /* macOS reports bytes */
#else
    long rss_kb = usage.ru_maxrss;        /* Linux reports kilobytes */
#endif
    printf(YELLOW "[MEMORIA] Uso actual: %ld KB\n" RESET, rss_kb);
}
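/* Detects the number of available CPU cores: get_nprocs() on Linux,
   sysctl hw.ncpu on macOS, sysconf(_SC_NPROCESSORS_ONLN) elsewhere. */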
int get_available_threads() {
#ifdef __linux__
    return get_nprocs();
#elif defined(__APPLE__)
    int numCPU;
    size_t size = sizeof(numCPU);
    sysctlbyname("hw.ncpu", &numCPU, &size, NULL, 0);
    return numCPU;
#else
    return sysconf(_SC_NPROCESSORS_ONLN);
#endif
}
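/* Appends a URL to the work queue (up to MAX_URLS) and wakes all waiting workers. */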
void enqueue(const char *url, int depth) {
    struct URLNode *new_node = malloc(sizeof(struct URLNode));
    if (!new_node) {
        fprintf(stderr, RED "[ERROR] No se pudo asignar memoria para nodo\n" RESET);
        return;
    }
    new_node->url = strdup(url);
    if (!new_node->url) {
        free(new_node);
        fprintf(stderr, RED "[ERROR] No se pudo duplicar URL\n" RESET);
        return;
    }
    new_node->depth = depth;
    new_node->next = NULL;
    pthread_mutex_lock(&queue_mutex);
    if (url_queue.count >= MAX_URLS) {
        printf(YELLOW "[WARN] Límite de URLs alcanzado (%d)\n" RESET, MAX_URLS);
        free(new_node->url);
        free(new_node);
    } else {
        if (!url_queue.rear) {
            url_queue.front = url_queue.rear = new_node;
        } else {
            url_queue.rear->next = new_node;
            url_queue.rear = new_node;
        }
        url_queue.count++;
        printf(GREEN "[COLA] Encolada %s (profundidad %d, total %d)\n" RESET, url, depth, url_queue.count);
    }
    printf(YELLOW "[COLA] Enviando broadcast a los hilos\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);
}
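/* Removes and returns the oldest queued URL. The caller must hold queue_mutex. */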
struct URLNode *dequeue() {
    if (!url_queue.front) {
        printf(YELLOW "[COLA] Cola vacía\n" RESET);
        return NULL;
    }
    struct URLNode *node = url_queue.front;
    url_queue.front = node->next;
    if (!url_queue.front) url_queue.rear = NULL;
    url_queue.count--;
    printf(BLUE "[COLA] Desencolada %s (restan %d)\n" RESET, node->url, url_queue.count);
    return node;
}
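/* libcurl write callback: appends each received chunk to a growing,
   NUL-terminated heap buffer owned by the caller. */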
size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, char **userp) {
    size_t realSize = size * nmemb;
    size_t oldSize = strlen(*userp); /* buffer is always NUL-terminated */
    char *tmp = realloc(*userp, oldSize + realSize + 1);
    if (!tmp) {
        fprintf(stderr, RED "[ERROR] Fallo en realloc\n" RESET);
        return 0; /* returning short aborts the transfer */
    }
    *userp = tmp;
    memcpy(*userp + oldSize, contents, realSize); /* append instead of overwriting earlier chunks */
    (*userp)[oldSize + realSize] = '\0';
    return realSize;
}
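/* Downloads the HTML of a page into a heap buffer that the caller must free.
   Follows redirects; TLS certificate verification is disabled. */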
char *fetch_html(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) {
        printf(RED "[ERROR] Fallo al inicializar CURL\n" RESET);
        return NULL;
    }
    char *response = malloc(1);
    if (!response) {
        curl_easy_cleanup(curl);
        return NULL;
    }
    response[0] = '\0';
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");
    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    if (res == CURLE_OK) {
        printf(GREEN "[DESCARGA] HTML de %s descargado en %.3f s\n" RESET, url, end - start);
    } else {
        printf(RED "[ERROR] Fallo al descargar %s: %s\n" RESET, url, curl_easy_strerror(res));
        free(response);
        response = NULL;
    }
    curl_easy_cleanup(curl);
    return response;
}
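/* Downloads a matched file URL into the working directory as
   downloaded_<processed_urls>_<basename>, using libcurl's default write-to-FILE. */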
void download_file(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) return;
    char filename[256];
    snprintf(filename, sizeof(filename), "downloaded_%d_%s", processed_urls, strrchr(url, '/') ? strrchr(url, '/') + 1 : "file");
    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        printf(RED "[ERROR] No se pudo abrir %s para escritura\n" RESET, filename);
        curl_easy_cleanup(curl);
        return;
    }
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");
    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    fclose(fp);
    if (res == CURLE_OK) {
        printf(GREEN "[ARCHIVO] Descargado %s como %s en %.3f s\n" RESET, url, filename, end - start);
    } else {
        printf(RED "[ERROR] Fallo al descargar archivo %s: %s\n" RESET, url, curl_easy_strerror(res));
        remove(filename);
    }
    curl_easy_cleanup(curl);
}
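/* Scans the HTML for href="..." links: URLs matching FILE_PATTERN (pdf/txt/md/doc/rtf)
   are downloaded immediately, everything else is re-enqueued one level deeper. */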
void extract_links(char *html, int depth) {
    regex_t url_regex, file_regex;
    if (regcomp(&url_regex, URL_PATTERN, REG_EXTENDED)) {
        printf(RED "[ERROR] Fallo al compilar regex\n" RESET);
        return;
    }
    if (regcomp(&file_regex, FILE_PATTERN, REG_EXTENDED)) {
        printf(RED "[ERROR] Fallo al compilar regex\n" RESET);
        regfree(&url_regex);
        return;
    }
    char *cursor = html;
    regmatch_t matches[2];
    while (!regexec(&url_regex, cursor, 2, matches, 0)) {
        int len = matches[1].rm_eo - matches[1].rm_so;
        char *url = strndup(cursor + matches[1].rm_so, len);
        regoff_t advance = matches[0].rm_eo; /* save the advance before matches could be reused */
        if (url && depth + 1 <= MAX_DEPTH) {
            if (!regexec(&file_regex, url, 0, NULL, 0)) {
                download_file(url);
                pthread_mutex_lock(&queue_mutex); /* avoid a data race on the counter */
                processed_urls++;
                pthread_mutex_unlock(&queue_mutex);
            } else {
                enqueue(url, depth + 1);
            }
        }
        free(url);
        cursor += advance; /* skip past the full href match */
    }
    regfree(&url_regex);
    regfree(&file_regex);
}
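/* Worker loop: wait for URLs, fetch the page, extract links, and exit once the
   queue is empty and main has cleared the processing flag. */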
void *worker_thread(void *unused) {
    (void)unused;
    printf(BLUE "[HILO %lu] Iniciado\n" RESET, (unsigned long)pthread_self());
    while (true) {
        struct URLNode *task = NULL;
        pthread_mutex_lock(&queue_mutex);
        while (!url_queue.front && processing) {
            printf(YELLOW "[HILO %lu] Esperando URLs...\n" RESET, (unsigned long)pthread_self());
            pthread_cond_wait(&queue_cv, &queue_mutex);
        }
        if (url_queue.front) {
            task = dequeue();
        } else if (!processing) {
            printf(BLUE "[HILO %lu] Terminando (sin URLs y procesamiento finalizado)\n" RESET, (unsigned long)pthread_self());
            pthread_mutex_unlock(&queue_mutex);
            break;
        }
        pthread_mutex_unlock(&queue_mutex);
        if (task) {
            printf(BLUE "[HILO %lu] Procesando %s (profundidad %d)\n" RESET, (unsigned long)pthread_self(), task->url, task->depth);
            char *html = fetch_html(task->url);
            if (html) {
                extract_links(html, task->depth);
                free(html);
            }
            free(task->url);
            free(task);
            print_memory_usage();
        }
    }
    return NULL;
}
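/* Usage: ./scraper <URL> <depth|auto> <threads|auto>
   Spawns the worker pool, seeds the queue with the start URL, lets the crawl run
   for a fixed 60-second window, then signals shutdown and joins the workers. */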
int main(int argc, char *argv[]) {
    if (argc != 4) {
        printf(RED "Uso: %s [URL] [PROFUNDIDAD] [HILOS]\n" RESET, argv[0]);
        return 1;
    }
    double start_time = get_time();
    curl_global_init(CURL_GLOBAL_ALL);
    int depth = (strcmp(argv[2], "auto") == 0) ? 5 : atoi(argv[2]);
    if (depth < 0 || depth > MAX_DEPTH) depth = MAX_DEPTH;
    /* NOTE: the CLI depth is only validated here; extract_links still limits the crawl with MAX_DEPTH (tool is work in progress). */
    int num_threads = (strcmp(argv[3], "auto") == 0) ? get_available_threads() : atoi(argv[3]);
    if (num_threads <= 0) num_threads = 1; /* guard against atoi failures and a zero-sized thread array */
    pthread_t threads[num_threads];
    for (int i = 0; i < num_threads; i++) {
        if (pthread_create(&threads[i], NULL, worker_thread, NULL) != 0) {
            printf(RED "[ERROR] Fallo al crear hilo %d\n" RESET, i);
        } else {
            printf(GREEN "[MAIN] Hilo %d creado\n" RESET, i);
        }
    }
    enqueue(argv[1], 0); /* seed the queue and signal the workers */
    sleep(60); /* fixed crawl window; this is why the sample run takes ~60 s */
    pthread_mutex_lock(&queue_mutex);
    processing = false;
    printf(YELLOW "[MAIN] Finalizando procesamiento\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);
    for (int i = 0; i < num_threads; i++) {
        pthread_join(threads[i], NULL);
        printf(GREEN "[MAIN] Hilo %d terminado\n" RESET, i);
    }
    double end_time = get_time();
    printf(GREEN "[INFO] Proceso completado en %.3f s. URLs procesadas: %d\n" RESET, end_time - start_time, processed_urls);
    print_memory_usage();
    curl_global_cleanup();
    return 0;
}