#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <curl/curl.h>
#include <regex.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <stdbool.h>

#ifdef __linux__
#include <sys/sysinfo.h>
#elif defined(__APPLE__)
#include <sys/sysctl.h>
#endif

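/*
 * Multithreaded web crawler: worker threads pull URLs from a shared queue,
 * download pages with libcurl, extract links with POSIX regex, and save
 * documents (pdf/txt/md/doc/rtf) to the current directory.
 */
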
#define RED    "\x1b[31m"
#define GREEN  "\x1b[32m"
#define YELLOW "\x1b[33m"
#define BLUE   "\x1b[34m"
#define RESET  "\x1b[0m"

/* FILE_PATTERN is matched against the URL already extracted from href="...",
 * so it only needs to recognize a document extension at the end of the URL. */
#define FILE_PATTERN "\\.(pdf|txt|md|doc|rtf)$"
#define URL_PATTERN "href=\"([^\"]+)\""
#define MAX_DEPTH 10
#define MAX_URLS 1000

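/*
 * queue_mutex protects url_queue and the `processing` flag; queue_cv wakes
 * worker threads whenever a URL is enqueued or the crawl is shut down.
 */
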
pthread_mutex_t queue_mutex = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t queue_cv = PTHREAD_COND_INITIALIZER;

struct URLNode {
    char *url;
    int depth;
    struct URLNode *next;
};

struct URLQueue {
    struct URLNode *front, *rear;
    int count;
} url_queue = {NULL, NULL, 0};

_Atomic int processed_urls = 0;   /* updated from several worker threads, hence atomic */
bool processing = true;           /* read and written only while holding queue_mutex */
int max_depth = MAX_DEPTH;        /* effective crawl depth, set from argv in main() */

double get_time(void) {
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return tv.tv_sec + tv.tv_usec / 1000000.0;
}

void print_memory_usage(void) {
    struct rusage usage;
    getrusage(RUSAGE_SELF, &usage);
    /* ru_maxrss is reported in kilobytes on Linux (and in bytes on macOS). */
    printf(YELLOW "[MEMORY] Peak usage: %ld KB\n" RESET, usage.ru_maxrss);
}

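/* Number of online CPUs: get_nprocs() on Linux, the hw.ncpu sysctl on macOS,
 * and sysconf(_SC_NPROCESSORS_ONLN) elsewhere. */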
int get_available_threads(void) {
#ifdef __linux__
    return get_nprocs();
#elif defined(__APPLE__)
    int numCPU;
    size_t size = sizeof(numCPU);
    sysctlbyname("hw.ncpu", &numCPU, &size, NULL, 0);
    return numCPU;
#else
    return (int)sysconf(_SC_NPROCESSORS_ONLN);
#endif
}

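/* Append a copy of `url` to the work queue (bounded by MAX_URLS) and wake
 * any worker waiting on queue_cv. */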
void enqueue(const char *url, int depth) {
    struct URLNode *new_node = malloc(sizeof(struct URLNode));
    if (!new_node) {
        fprintf(stderr, RED "[ERROR] Could not allocate memory for node\n" RESET);
        return;
    }
    new_node->url = strdup(url);
    if (!new_node->url) {
        free(new_node);
        fprintf(stderr, RED "[ERROR] Could not duplicate URL\n" RESET);
        return;
    }
    new_node->depth = depth;
    new_node->next = NULL;

    pthread_mutex_lock(&queue_mutex);
    if (url_queue.count >= MAX_URLS) {
        printf(YELLOW "[WARN] URL limit reached (%d)\n" RESET, MAX_URLS);
        free(new_node->url);
        free(new_node);
    } else {
        if (!url_queue.rear) {
            url_queue.front = url_queue.rear = new_node;
        } else {
            url_queue.rear->next = new_node;
            url_queue.rear = new_node;
        }
        url_queue.count++;
        printf(GREEN "[QUEUE] Enqueued %s (depth %d, total %d)\n" RESET, url, depth, url_queue.count);
    }
    printf(YELLOW "[QUEUE] Broadcasting to worker threads\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);
}

/* Remove and return the head of the queue; the caller must hold queue_mutex. */
struct URLNode *dequeue(void) {
    if (!url_queue.front) {
        printf(YELLOW "[QUEUE] Queue is empty\n" RESET);
        return NULL;
    }
    struct URLNode *node = url_queue.front;
    url_queue.front = node->next;
    if (!url_queue.front) url_queue.rear = NULL;
    url_queue.count--;
    printf(BLUE "[QUEUE] Dequeued %s (%d remaining)\n" RESET, node->url, url_queue.count);
    return node;
}

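/* libcurl write callback: it can be invoked many times per transfer, so each
 * chunk is appended to the growing, NUL-terminated buffer in *userp.
 * Returning anything other than realSize makes libcurl abort the transfer. */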
size_t WriteMemoryCallback(void *contents, size_t size, size_t nmemb, char **userp) {
    size_t realSize = size * nmemb;
    size_t oldLen = strlen(*userp);
    char *grown = realloc(*userp, oldLen + realSize + 1);
    if (!grown) {
        fprintf(stderr, RED "[ERROR] realloc failed\n" RESET);
        return 0;
    }
    *userp = grown;
    memcpy(*userp + oldLen, contents, realSize);
    (*userp)[oldLen + realSize] = '\0';
    return realSize;
}

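/* Download a page into a heap-allocated, NUL-terminated buffer; the caller
 * frees it. Note: SSL peer/host verification is disabled below, which is
 * insecure and only acceptable for testing. */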
char *fetch_html(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) {
        printf(RED "[ERROR] Failed to initialize CURL\n" RESET);
        return NULL;
    }
    char *response = malloc(1);
    if (!response) {
        curl_easy_cleanup(curl);
        return NULL;
    }
    response[0] = '\0';

    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, &response);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYPEER, 0L);
    curl_easy_setopt(curl, CURLOPT_SSL_VERIFYHOST, 0L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");

    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    if (res == CURLE_OK) {
        printf(GREEN "[DOWNLOAD] HTML from %s fetched in %.3f s\n" RESET, url, end - start);
    } else {
        printf(RED "[ERROR] Failed to download %s: %s\n" RESET, url, curl_easy_strerror(res));
        free(response);
        response = NULL;
    }
    curl_easy_cleanup(curl);
    return response;
}

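/* Save the target URL to a local file. No CURLOPT_WRITEFUNCTION is set, so
 * libcurl's default callback fwrite()s each chunk to the FILE* passed via
 * CURLOPT_WRITEDATA; a partially written file is removed on failure. */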
void download_file(const char *url) {
    double start = get_time();
    CURL *curl = curl_easy_init();
    if (!curl) return;

    char filename[256];
    snprintf(filename, sizeof(filename), "downloaded_%d_%s", processed_urls,
             strrchr(url, '/') ? strrchr(url, '/') + 1 : "file");

    FILE *fp = fopen(filename, "wb");
    if (!fp) {
        printf(RED "[ERROR] Could not open %s for writing\n" RESET, filename);
        curl_easy_cleanup(curl);
        return;
    }

    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, fp);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_USERAGENT, "Mozilla/5.0 (compatible; MyScraper/1.0)");

    CURLcode res = curl_easy_perform(curl);
    double end = get_time();
    fclose(fp);
    if (res == CURLE_OK) {
        printf(GREEN "[FILE] Downloaded %s as %s in %.3f s\n" RESET, url, filename, end - start);
    } else {
        printf(RED "[ERROR] Failed to download file %s: %s\n" RESET, url, curl_easy_strerror(res));
        remove(filename);
    }
    curl_easy_cleanup(curl);
}

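/* Scan the fetched HTML with POSIX extended regex: hrefs ending in a known
 * document extension are downloaded immediately, everything else is enqueued
 * for further crawling. Relative URLs are not resolved against the page's
 * base URL and duplicates are not filtered. */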
void extract_links(char *html, int depth) {
    regex_t url_regex, file_regex;
    if (regcomp(&url_regex, URL_PATTERN, REG_EXTENDED) || regcomp(&file_regex, FILE_PATTERN, REG_EXTENDED)) {
        printf(RED "[ERROR] Failed to compile regex\n" RESET);
        return;
    }

    char *cursor = html;
    regmatch_t matches[2];
    while (!regexec(&url_regex, cursor, 2, matches, 0)) {
        int len = matches[1].rm_eo - matches[1].rm_so;
        char *url = strndup(cursor + matches[1].rm_so, len);
        if (!url) break;
        if (depth + 1 <= max_depth) {
            /* The file regex is only a yes/no test, so no match offsets are
             * requested and `matches` keeps the href offsets used below. */
            if (!regexec(&file_regex, url, 0, NULL, 0)) {
                download_file(url);
                processed_urls++;
            } else {
                enqueue(url, depth + 1);
            }
        }
        free(url);
        cursor += matches[0].rm_eo;   /* advance past the full href="..." match */
    }
    regfree(&url_regex);
    regfree(&file_regex);
}

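/* Worker loop: block on queue_cv until a URL is available or main clears
 * `processing`, then fetch and parse the page outside the lock. */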
void *worker_thread(void *unused) {
    (void)unused;
    printf(BLUE "[THREAD %lu] Started\n" RESET, (unsigned long)pthread_self());
    while (true) {
        struct URLNode *task = NULL;

        pthread_mutex_lock(&queue_mutex);
        while (!url_queue.front && processing) {
            printf(YELLOW "[THREAD %lu] Waiting for URLs...\n" RESET, (unsigned long)pthread_self());
            pthread_cond_wait(&queue_cv, &queue_mutex);
        }

        if (url_queue.front) {
            task = dequeue();
        } else if (!processing) {
            printf(BLUE "[THREAD %lu] Exiting (no URLs left and processing finished)\n" RESET, (unsigned long)pthread_self());
            pthread_mutex_unlock(&queue_mutex);
            break;
        }

        pthread_mutex_unlock(&queue_mutex);

        if (task) {
            printf(BLUE "[THREAD %lu] Processing %s (depth %d)\n" RESET, (unsigned long)pthread_self(), task->url, task->depth);
            char *html = fetch_html(task->url);
            if (html) {
                extract_links(html, task->depth);
                free(html);
            }
            free(task->url);
            free(task);
            print_memory_usage();
        }
    }
    return NULL;
}

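/* Entry point: parse arguments ("auto" selects defaults), start the worker
 * pool, seed the queue with the start URL, and after a fixed 60-second crawl
 * window tell the workers to drain the queue and exit. */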
int main(int argc, char *argv[]) {
    if (argc != 4) {
        printf(RED "Usage: %s [URL] [DEPTH] [THREADS]\n" RESET, argv[0]);
        return 1;
    }

    double start_time = get_time();
    curl_global_init(CURL_GLOBAL_ALL);

    int depth = (strcmp(argv[2], "auto") == 0) ? 5 : atoi(argv[2]);
    if (depth < 0 || depth > MAX_DEPTH) depth = MAX_DEPTH;
    max_depth = depth;   /* make the requested depth visible to extract_links() */

    int num_threads = (strcmp(argv[3], "auto") == 0) ? get_available_threads() : atoi(argv[3]);
    if (num_threads <= 0) num_threads = 1;   /* guard against a zero-length thread array */

    pthread_t threads[num_threads];
    for (int i = 0; i < num_threads; i++) {
        if (pthread_create(&threads[i], NULL, worker_thread, NULL) != 0) {
            printf(RED "[ERROR] Failed to create thread %d\n" RESET, i);
        } else {
            printf(GREEN "[MAIN] Thread %d created\n" RESET, i);
        }
    }

    enqueue(argv[1], 0);   /* seed the queue and signal the workers */

    sleep(60);             /* fixed crawl window before shutting down */
    pthread_mutex_lock(&queue_mutex);
    processing = false;
    printf(YELLOW "[MAIN] Finishing processing\n" RESET);
    pthread_cond_broadcast(&queue_cv);
    pthread_mutex_unlock(&queue_mutex);

    for (int i = 0; i < num_threads; i++) {
        pthread_join(threads[i], NULL);
        printf(GREEN "[MAIN] Thread %d finished\n" RESET, i);
    }

    double end_time = get_time();
    printf(GREEN "[INFO] Completed in %.3f s. URLs processed: %d\n" RESET, end_time - start_time, processed_urls);
    print_memory_usage();

    curl_global_cleanup();
    return 0;
}
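
/* Build and run (typical invocation; assumes the file is saved as scraper.c):
 *     cc -O2 -o scraper scraper.c -lcurl -lpthread
 *     ./scraper https://example.com auto auto
 */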