Skip to content

Instantly share code, notes, and snippets.

@thiswillbeyourgithub
Last active April 11, 2025 11:37
Show Gist options
  • Save thiswillbeyourgithub/036df5b4f1d8c48950c085c42955e099 to your computer and use it in GitHub Desktop.
Save thiswillbeyourgithub/036df5b4f1d8c48950c085c42955e099 to your computer and use it in GitHub Desktop.
Phone book downloader
/**
 * Converts the scraped phone book HTML to CSV and triggers a download of it.
 *
 * Row selection: elements matching `.TabSmall_Row_Item_Alt` or
 * `.TabSmall_Row_Item`; if none exist, every `tr` found inside a `section`.
 * Rows with exactly six `td` cells get the positional classes
 * nom / prenom / localisation / numero / numero_sda / email, and a
 * `.localisation` cell's text is replaced by its `title` attribute when set.
 *
 * A row is kept when it has at least one of numero, numero_sda or email AND
 * nom + prenom + localisation together are at least 3 characters long.
 * Exact duplicate rows are dropped and the output is sorted by the
 * lower-cased nom followed by prenom.
 *
 * Side effects: logs progress to the console and clicks a temporary link to
 * download `PhoneBook_YYYY-MM-DD.csv` (date taken from `toISOString()`, i.e. UTC).
 *
 * @param {string} htmlContent - The HTML content to convert
 * @returns {string} The CSV text (header line plus one line per kept row), or
 *   the literal 'Error,during,conversion\n' if any step throws.
 */
function phoneBookCSVConverter(htmlContent) {
  console.log('Starting CSV conversion...');

  // Column order of the CSV; also the class names assigned to the six cells.
  const columns = ['nom', 'prenom', 'localisation', 'numero', 'numero_sda', 'email'];

  // A field must be wrapped in quotes when it contains a quote, a comma or a
  // line break; wrapping only on commas produced malformed CSV for values
  // such as `O"Neil`. Embedded quotes are always doubled.
  const escapeForCSV = (text) => {
    const doubled = text.replace(/"/g, '""');
    return /[",\r\n]/.test(text) ? `"${doubled}"` : doubled;
  };

  // Trimmed text of the first descendant carrying the given class, or ''.
  const readField = (row, className) =>
    row.querySelector(`.${className}`)?.textContent.trim() || '';

  try {
    const doc = new DOMParser().parseFromString(htmlContent, 'text/html');

    const rows = doc.querySelectorAll('.TabSmall_Row_Item_Alt, .TabSmall_Row_Item');
    console.log(`Found ${rows.length} TabSmall_Row_Item_Alt and TabSmall_Row_Item elements`);

    // Always work on a real array so both branches yield the same type.
    let allRows;
    if (rows.length === 0) {
      console.log('No rows found directly, trying to extract from sections...');
      allRows = [];
      for (const section of doc.querySelectorAll('section')) {
        for (const tableRow of section.querySelectorAll('tr')) {
          allRows.push(tableRow);
        }
      }
      console.log(`Found ${allRows.length} rows from all sections`);
    } else {
      allRows = Array.from(rows);
    }

    // Tag the cells of six-column rows by position so they can be read by class.
    for (const row of allRows) {
      const cells = row.querySelectorAll('td');
      if (cells.length === columns.length) {
        columns.forEach((className, position) => {
          cells[position].classList.add(className);
        });
      }
    }

    // Use the title attribute as the localisation text whenever it is set.
    for (const cell of doc.querySelectorAll('.localisation')) {
      const title = cell.getAttribute('title');
      if (title) {
        cell.textContent = title;
      }
    }

    const headers = columns.join(',');
    const seenLines = new Set();
    const validRows = [];

    for (const row of allRows) {
      const [nom, prenom, hierarchie, numero, numeroSda, email] = columns.map((className) =>
        readField(row, className)
      );

      const hasContact = Boolean(numero || numeroSda || email);
      const hasIdentity = nom.length + prenom.length + hierarchie.length >= 3;
      if (!hasContact || !hasIdentity) {
        continue;
      }

      // The escaped line doubles as the deduplication key.
      const rowString = [nom, prenom, hierarchie, numero, numeroSda, email]
        .map(escapeForCSV)
        .join(',');
      if (seenLines.has(rowString)) {
        continue;
      }
      seenLines.add(rowString);
      validRows.push({
        sortKey: `${nom.toLowerCase()}${prenom.toLowerCase()}`,
        data: rowString,
      });
    }

    validRows.sort((a, b) => a.sortKey.localeCompare(b.sortKey));

    // Header first, one line per row, every line newline-terminated.
    const csvContent = `${[headers, ...validRows.map((entry) => entry.data)].join('\n')}\n`;
    console.log(`CSV conversion complete. Generated ${validRows.length} data rows after filtering and deduplication`);

    // YYYY-MM-DD part of the ISO timestamp, used in the download file name.
    const dateStr = new Date().toISOString().split('T')[0];

    const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8' });
    const url = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = url;
    link.download = `PhoneBook_${dateStr}.csv`;
    console.log('Triggering CSV download with filename:', link.download);
    link.click();
    console.log('CSV download triggered');

    return csvContent;
  } catch (error) {
    console.error('Error converting HTML to CSV:', error);
    console.error('Stack trace:', error.stack);
    return 'Error,during,conversion\n';
  }
}
/**
 * Looks up a CSS selector in the main document first and then, if nothing
 * matched, in each entry of `window.frames` in order.
 *
 * The search stops at the first document that yields a match, so with
 * `findAll` the result only ever contains matches from a single document.
 * A frame whose document cannot be read or queried is logged with
 * `console.warn` and skipped.
 *
 * @param {string} selector - CSS selector to look for
 * @param {boolean} [findAll=false] - true to use querySelectorAll, false for querySelector
 * @returns {*} With `findAll`: the non-empty querySelectorAll result of the
 *   first matching document, or `[]`. Otherwise: the first matching element, or `null`.
 */
function findElementInAllFrames(selector, findAll = false) {
  // Runs the requested query on one document; undefined means "no match here".
  const lookupIn = (root) => {
    if (findAll) {
      const matches = root.querySelectorAll(selector);
      return matches && matches.length > 0 ? matches : undefined;
    }
    const match = root.querySelector(selector);
    return match ? match : undefined;
  };

  // The main document takes priority; errors here are not caught.
  const fromMainDocument = lookupIn(document);
  if (fromMainDocument !== undefined) {
    return fromMainDocument;
  }

  try {
    const total = window.frames.length;
    for (let n = 0; n < total; n += 1) {
      try {
        const innerDocument = window.frames[n].document;
        if (innerDocument) {
          const fromFrame = lookupIn(innerDocument);
          if (fromFrame !== undefined) {
            return fromFrame;
          }
        }
      } catch (frameError) {
        console.warn(`Error accessing frame ${n}:`, frameError);
      }
    }
  } catch (framesError) {
    console.error('Error accessing frames:', framesError);
  }

  return findAll ? [] : null;
}
/**
 * Clicks through every letter tab of the phone book and collects the table
 * rows shown after each click into one HTML document string.
 *
 * Letter tabs are the anchors matched by
 * `#tabsABC > tbody > tr > td:nth-child(k) > a` for k = 1..35. Tabs are
 * visited starting at "Z" and walking backwards through the tab list,
 * wrapping around to the tabs after "Z"; when no "Z" tab exists the list is
 * simply visited in reverse. The tab list is looked up again before every
 * click and the tab is picked by its position in the list.
 *
 * After each click the function waits a fixed 2000 ms plus 500 ms, then takes
 * every `tr` returned by `findElementInAllFrames('tr', true)` that has more
 * than one cell and more than 5 characters of trimmed text. Each letter
 * becomes a `<section id="letter-x">` (id `letter-all` for the `*` tab).
 * A failure while handling one letter is logged and the next letter is tried.
 *
 * @returns {Promise<string>} The accumulated `<html><body>…</body></html>` markup.
 * @throws {Error} If no letter tab can be found at all.
 */
async function scrapePhoneBook() {
  // Highest td position probed: letters A-Z plus potential additional tabs.
  const maxTabPosition = 35;

  // Gathers the letter anchors currently present, in column order.
  const collectLetterLinks = () => {
    const links = [];
    for (let position = 1; position <= maxTabPosition; position++) {
      const link = findElementInAllFrames(`#tabsABC > tbody > tr > td:nth-child(${position}) > a`);
      if (link !== null) {
        links.push(link);
      }
    }
    return links;
  };

  // Resolves after the given number of milliseconds.
  const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  console.log('Starting letter element search...');
  console.log('Using specific selector strategy to find letter elements');
  let letterElements = collectLetterLinks();
  console.log(`Found ${letterElements.length} letter elements`);
  console.log(`Final result: Found ${letterElements.length} letter elements to process`);
  letterElements.forEach((el, idx) => {
    console.log(`Letter ${idx + 1}: "${el.textContent.trim()}" href=${el.getAttribute('href')}`);
  });

  // Abort before announcing that we "proceed anyway" when there is nothing to process.
  if (letterElements.length === 0) {
    throw new Error('Could not find any letter elements after multiple attempts. Aborting.');
  }
  if (letterElements.length < 26) {
    console.warn(`Expected 26 letter elements, but found only ${letterElements.length}. Will attempt to proceed anyway.`);
  }

  let result = '<html><body>';
  const totalLetterElements = letterElements.length;
  let totalRowsProcessed = 0;

  const letterContents = letterElements.map((el) => el.textContent.trim());
  console.log(`Found letter elements: ${letterContents.join(', ')}`);

  // Visit order: Z first, then backwards through the list with wrap-around.
  const zIndex = letterContents.indexOf('Z');
  let letterIndices;
  if (zIndex !== -1) {
    console.log(`Found Z at index ${zIndex}`);
    letterIndices = Array.from(
      { length: totalLetterElements },
      (_, step) => (zIndex - step + totalLetterElements) % totalLetterElements
    );
  } else {
    console.warn("Z not found! Falling back to reverse order");
    letterIndices = Array.from(
      { length: totalLetterElements },
      (_, step) => totalLetterElements - step - 1
    );
  }

  const letterSequence = letterIndices.map((idx) => letterContents[idx] || '?').join(', ');
  console.log(`Letter elements will be processed in this order: ${letterSequence}`);

  const firstLetter = letterContents[letterIndices[0]] || '?';
  console.log(`First letter to process: "${firstLetter}"`);
  if (firstLetter !== 'Z') {
    console.warn(`Warning: First letter is not Z as expected!`);
  }

  for (let letterPos = 0; letterPos < letterIndices.length; letterPos++) {
    const i = letterIndices[letterPos];
    // Progress is counted by position in the visit order; deriving it from
    // the tab index gave wrong numbers whenever the order wraps around.
    const progress = letterPos + 1;
    console.log(`\n==== Processing letter index ${progress}/${totalLetterElements} (actual index: ${i}) ====`);
    console.log(`Processing letter index ${progress} (actual index: ${i})`);

    // Look the tabs up again so the click goes to an element of the current DOM.
    console.log(`Reloading letter elements for index ${i + 1}`);
    letterElements = collectLetterLinks();
    console.log(`Reloaded ${letterElements.length} letter elements for processing index ${i + 1}`);

    if (i >= letterElements.length) {
      console.error(`Index ${i + 1} is out of bounds for reloaded elements array (length: ${letterElements.length})`);
      continue;
    }

    // Non-null by construction: collectLetterLinks only keeps non-null
    // results and i is within bounds, so no further null guard is needed.
    const letterLink = letterElements[i];
    // Declared outside the try block so the catch block can report it.
    let letter = 'unknown';

    try {
      letter = letterLink.textContent.trim();
      console.log(`Processing letter ${letter} (${i + 1}/${totalLetterElements})`);
      console.log(`Processing letter "${letter}"`);
      console.log(`Processing letter element for "${letter}"`);
      console.log(`Processing letter element:`, {
        text: letterLink.textContent,
        href: letterLink.getAttribute('href'),
        id: letterLink.id,
        classes: letterLink.className,
      });
      console.log(`Clicking on letter "${letter}" link`);
      console.log(`Checking letter link properties for "${letter}":`, {
        exists: !!letterLink,
        isElement: letterLink instanceof Element,
        hasClick: typeof letterLink.click === 'function',
        href: letterLink.getAttribute('href'),
      });

      const maxClickAttempts = 2;
      let clickAttempts = 0;
      let clickSuccess = false;
      while (!clickSuccess && clickAttempts < maxClickAttempts) {
        clickAttempts++;
        console.log(`Click attempt ${clickAttempts}/${maxClickAttempts} for letter "${letter}"`);
        try {
          if (typeof letterLink.click === 'function') {
            letterLink.click();
          } else {
            console.warn(`Letter "${letter}" element does not have click method`);
          }
          // No load event is observed; the page is given a fixed 2 seconds.
          console.log(`Waiting for page to load after clicking "${letter}" (attempt ${clickAttempts})...`);
          const startWait = Date.now();
          await pause(2000);
          clickSuccess = true;
          console.log(`Waited ${Date.now() - startWait}ms after clicking "${letter}"`);
          console.log(`Proceeding with letter "${letter}" after ${clickAttempts} attempts`);
        } catch (clickError) {
          console.error(`Error during click attempt ${clickAttempts} for letter "${letter}":`, clickError);
          await pause(1000);
        }
      }

      if (!clickSuccess) {
        console.error(`Failed to navigate to letter "${letter}" after multiple attempts`);
        continue;
      }

      // Small extra wait on top of the fixed delay above.
      await pause(500);
      console.log(`Current URL after clicking and waiting: ${window.location.href}`);

      console.log(`Looking for table rows for letter "${letter}"`);
      const rows = findElementInAllFrames('tr', true);
      console.log(`Found ${rows.length} total rows in the document`);

      // Keep rows with more than one cell and more than 5 characters of text;
      // shorter rows are treated as navigation/header rows.
      const contentRows = Array.from(rows).filter(
        (row) => row.cells && row.cells.length > 1 && row.textContent.trim().length > 5
      );
      console.log(`Filtered to ${contentRows.length} content rows for letter "${letter}"`);

      totalRowsProcessed += contentRows.length;
      console.log(`Letter "${letter}": Added ${contentRows.length} rows. Total rows so far: ${totalRowsProcessed}`);

      // Build the whole section first so an exception while serialising a row
      // cannot leave an unclosed <section> in the accumulated output.
      const isCatchAll = letter === '*';
      const letterId = isCatchAll ? 'all' : letter.toLowerCase();
      const heading = isCatchAll ? 'All' : `Letter ${letter}`;
      let sectionHtml = `<section id="letter-${letterId}">\n<h2>${heading}</h2>\n<table>\n`;
      for (const row of contentRows) {
        sectionHtml += `${row.outerHTML}\n`;
      }
      sectionHtml += '</table>\n</section>\n\n';
      result += sectionHtml;

      if (contentRows.length === 0) {
        console.warn(`No rows found for letter "${letter}" - will continue with next letter`);
      }
      console.log(`Successfully processed letter "${letter}" with ${contentRows.length} entries`);
      console.log(`Current accumulated HTML size: ${result.length} characters`);
    } catch (error) {
      // A failure on one letter must not stop the remaining letters.
      console.error(`Error processing letter "${letter}":`, error);
      console.error(`Stack trace:`, error.stack);
    }
  }

  result += '</body></html>';
  console.log('Finished processing all letters. Final HTML size:', result.length);
  console.log(`Total rows processed across all sections: ${totalRowsProcessed}`);
  console.log(`=== Phone book scraping process complete ===`);
  return result;
}
/**
 * Runs the whole pipeline: scrapes the phone book, offers the raw HTML as a
 * `PhoneBook_content.html` download, then hands the same HTML to
 * `phoneBookCSVConverter`.
 *
 * Errors raised while creating the download or converting are logged and
 * swallowed; errors from `scrapePhoneBook` propagate to the caller.
 *
 * @returns {Promise<string>} The scraped HTML.
 */
async function runPhoneBookScraper() {
  console.log('Starting phone book scraper');

  const scrapedHtml = await scrapePhoneBook();

  console.log('Creating download for HTML file...');
  try {
    const objectUrl = URL.createObjectURL(new Blob([scrapedHtml], {type: 'text/html'}));
    console.log('Blob URL created:', objectUrl);

    // Temporary anchor that is never attached to the page, only clicked.
    const anchor = Object.assign(document.createElement('a'), {
      href: objectUrl,
      download: 'PhoneBook_content.html',
    });
    console.log('Triggering download with filename:', anchor.download);
    anchor.click();
    console.log('Download triggered');

    // The converter triggers its own CSV download; its return value is not needed here.
    console.log('Starting CSV conversion process...');
    phoneBookCSVConverter(scrapedHtml);
    console.log('CSV conversion completed and download triggered from converter');
  } catch (downloadError) {
    console.error('Error creating download:', downloadError);
  }

  return scrapedHtml;
}
// Entry point: start the scraper right away and report the outcome once it settles.
console.log('Starting script execution');
(async () => {
  try {
    const content = await runPhoneBookScraper();
    console.log('All content processed successfully');
    console.log(`Final content size: ${content.length} characters`);
  } catch (error) {
    console.error('Fatal error in runPhoneBookScraper:', error);
    console.error('Stack trace:', error.stack);
  }
})();
// Logged before the scraper finishes, because the work above is asynchronous.
console.log('Script initialized');
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment