Last active
April 11, 2025 11:37
-
-
Save thiswillbeyourgithub/036df5b4f1d8c48950c085c42955e099 to your computer and use it in GitHub Desktop.
Phone book downloader
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Converts the scraped phone-book HTML to CSV and triggers a browser download of the result.
 *
 * Rows are taken from elements with class `TabSmall_Row_Item_Alt` / `TabSmall_Row_Item`;
 * if none exist, every `tr` inside a `section` element is used instead. Rows with exactly
 * six `td` cells get the column classes nom, prenom, localisation, numero, numero_sda and
 * email assigned by position. A row is kept only when it has at least one of numero,
 * numero_sda or email AND nom + prenom + localisation total at least 3 characters.
 * Kept rows are de-duplicated and sorted by lower-cased nom + prenom.
 *
 * Side effect: downloads the CSV as `PhoneBook_YYYY-MM-DD.csv` through a temporary link.
 *
 * @param {string} htmlContent - The HTML content to convert
 * @returns {string} The converted CSV content, or the line 'Error,during,conversion\n'
 *   when any step throws (the error is logged, not rethrown)
 */
function phoneBookCSVConverter(htmlContent) {
  // Column order of the CSV; also the class names assigned to the six cells of a row.
  const COLUMN_CLASSES = ['nom', 'prenom', 'localisation', 'numero', 'numero_sda', 'email'];
  // Delay before the blob URL is released, so the browser has time to start the download.
  const REVOKE_DELAY_MS = 40000;

  // Doubles embedded quotes and wraps the field in quotes whenever it contains a
  // character that would otherwise break the record: comma, double quote, CR or LF.
  const escapeForCSV = (text) => {
    const doubled = text.replace(/"/g, '""');
    return /[",\r\n]/.test(text) ? `"${doubled}"` : doubled;
  };

  // Trimmed text of the first descendant carrying the given class, or '' when absent.
  const readCell = (row, className) =>
    row.querySelector(`.${className}`)?.textContent.trim() || '';

  console.log('Starting CSV conversion...');
  try {
    const parser = new DOMParser();
    const doc = parser.parseFromString(htmlContent, 'text/html');

    const rows = doc.querySelectorAll('.TabSmall_Row_Item_Alt, .TabSmall_Row_Item');
    console.log(`Found ${rows.length} TabSmall_Row_Item_Alt and TabSmall_Row_Item elements`);

    let allRows;
    if (rows.length === 0) {
      // No classed rows: fall back to every table row inside the per-letter sections.
      console.log('No rows found directly, trying to extract from sections...');
      allRows = Array.from(doc.querySelectorAll('section')).flatMap((section) =>
        Array.from(section.querySelectorAll('tr'))
      );
      console.log(`Found ${allRows.length} rows from all sections`);
    } else {
      allRows = Array.from(rows);
    }

    // Tag the cells of six-column rows so they can be read back by class name below.
    for (const row of allRows) {
      const cells = row.querySelectorAll('td');
      if (cells.length === COLUMN_CLASSES.length) {
        COLUMN_CLASSES.forEach((className, position) => {
          cells[position].classList.add(className);
        });
      }
    }

    // For localisation cells, a non-empty title attribute replaces the visible text.
    for (const element of doc.querySelectorAll('.localisation')) {
      const titleAttr = element.getAttribute('title');
      if (titleAttr) {
        element.textContent = titleAttr;
      }
    }

    const headers = COLUMN_CLASSES.join(',');
    const uniqueRows = new Set();
    const validRows = [];

    for (const row of allRows) {
      const [nom, prenom, hierarchie, numero, numero_sda, email] = COLUMN_CLASSES.map(
        (className) => readCell(row, className)
      );

      const hasContact = Boolean(numero || numero_sda || email);
      const hasIdentity = nom.length + prenom.length + hierarchie.length >= 3;
      if (!hasContact || !hasIdentity) {
        continue;
      }

      const rowString = [nom, prenom, hierarchie, numero, numero_sda, email]
        .map(escapeForCSV)
        .join(',');

      // The serialized line doubles as the de-duplication key.
      if (uniqueRows.has(rowString)) {
        continue;
      }
      uniqueRows.add(rowString);
      validRows.push({
        sortKey: `${nom.toLowerCase()}${prenom.toLowerCase()}`,
        data: rowString,
      });
    }

    validRows.sort((a, b) => a.sortKey.localeCompare(b.sortKey));

    const csvContent = `${[headers, ...validRows.map((row) => row.data)].join('\n')}\n`;
    console.log(`CSV conversion complete. Generated ${validRows.length} data rows after filtering and deduplication`);

    // Date part (YYYY-MM-DD) of the ISO timestamp, used in the download filename.
    const dateStr = new Date().toISOString().split('T')[0];

    const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8' });
    const url = URL.createObjectURL(blob);
    const link = document.createElement('a');
    link.href = url;
    link.download = `PhoneBook_${dateStr}.csv`;
    console.log('Triggering CSV download with filename:', link.download);
    link.click();
    console.log('CSV download triggered');
    // Release the blob URL later instead of keeping it alive for the page's lifetime.
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);

    return csvContent;
  } catch (error) {
    console.error('Error converting HTML to CSV:', error);
    console.error('Stack trace:', error.stack);
    return 'Error,during,conversion\n';
  }
}
/**
 * Finds elements matching a selector in the main document or, failing that, in any frame.
 *
 * The main document is searched first. If it has no match, frames are searched
 * breadth-first: every direct child frame of `window` in index order, then their child
 * frames, and so on. The first document that yields a match wins; results from
 * different documents are never merged.
 *
 * Frames whose document cannot be read (for example because access throws) are logged
 * with `console.warn` and skipped.
 *
 * @param {string} selector - CSS selector to look for
 * @param {boolean} [findAll=false] - When true, return all matches of the first matching
 *   document; when false, return only the first matching element
 * @returns {Element|NodeList|Array|null} With `findAll`: the non-empty `querySelectorAll`
 *   result of the first matching document, or an empty array when nothing matches.
 *   Without `findAll`: the matching element, or null.
 */
function findElementInAllFrames(selector, findAll = false) {
  // Runs the query against one document; returns null when that document has no match.
  const searchDocument = (doc) => {
    if (findAll) {
      const matches = doc.querySelectorAll(selector);
      return matches && matches.length > 0 ? matches : null;
    }
    return doc.querySelector(selector) || null;
  };

  // Lists the direct child frames of a window together with a dotted index path for logs.
  const listChildFrames = (win, parentPath) => {
    const frames = win.frames;
    const count = frames ? frames.length : 0;
    const children = [];
    for (let i = 0; i < count; i++) {
      children.push({
        frame: frames[i],
        path: parentPath === '' ? `${i}` : `${parentPath}.${i}`,
      });
    }
    return children;
  };

  const mainMatch = searchDocument(document);
  if (mainMatch) {
    return mainMatch;
  }

  try {
    const visited = new Set();
    let level = listChildFrames(window, '');
    // Breadth-first so that a match in a direct child frame is always preferred over a
    // match nested deeper in an earlier frame.
    while (level.length > 0) {
      const nextLevel = [];
      for (const { frame, path } of level) {
        if (!frame || visited.has(frame)) {
          continue;
        }
        visited.add(frame);

        try {
          const frameDoc = frame.document;
          if (frameDoc) {
            const match = searchDocument(frameDoc);
            if (match) {
              return match;
            }
          }
        } catch (frameError) {
          console.warn(`Error accessing frame ${path}:`, frameError);
        }

        try {
          nextLevel.push(...listChildFrames(frame, path));
        } catch (childError) {
          console.warn(`Error listing child frames of frame ${path}:`, childError);
        }
      }
      level = nextLevel;
    }
  } catch (framesError) {
    console.error('Error accessing frames:', framesError);
  }

  return findAll ? [] : null;
}
/**
 * Clicks through every letter tab of the phone book and collects the table rows shown
 * for each one into a single HTML string.
 *
 * Letter tabs are the links matched by `#tabsABC > tbody > tr > td:nth-child(n) > a`
 * for n = 1..35, looked up through `findElementInAllFrames`. Tabs are processed starting
 * at the one labelled "Z" and walking backwards (wrapping around to the tabs after Z);
 * if no "Z" tab exists they are processed in plain reverse order.
 *
 * For each tab the function clicks the link, waits a fixed delay, then takes all `tr`
 * elements returned by `findElementInAllFrames('tr', true)`, keeps those with more than
 * one cell and more than 5 characters of text, and appends their `outerHTML` inside a
 * `<section id="letter-…">`. A failure on one letter is logged and the loop moves on.
 *
 * @returns {Promise<string>} An `<html><body>…</body></html>` string with one section per
 *   successfully processed letter
 * @throws {Error} When no letter link can be found at all
 */
async function scrapePhoneBook() {
  // Number of td positions probed for letter links (A-Z plus possible extra tabs).
  const LETTER_SLOT_COUNT = 35;
  const EXPECTED_LETTER_COUNT = 26;
  const MAX_CLICK_ATTEMPTS = 2;
  // Fixed waits: the script has no load signal to observe, so it simply sleeps.
  const PAGE_LOAD_WAIT_MS = 2000;
  const RETRY_WAIT_MS = 1000;
  const RENDER_SETTLE_WAIT_MS = 500;

  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const labelOf = (element) => element.textContent.trim();

  // Escapes text before it is interpolated into the generated HTML.
  const HTML_ENTITIES = { '&': '&amp;', '<': '&lt;', '>': '&gt;', '"': '&quot;', "'": '&#39;' };
  const escapeHtml = (text) => text.replace(/[&<>"']/g, (ch) => HTML_ENTITIES[ch]);

  // Probes each td position of the letter bar and returns the links that exist.
  const collectLetterElements = () => {
    const found = [];
    for (let slot = 1; slot <= LETTER_SLOT_COUNT; slot++) {
      const link = findElementInAllFrames(`#tabsABC > tbody > tr > td:nth-child(${slot}) > a`);
      if (link !== null) {
        found.push(link);
      }
    }
    return found;
  };

  console.log('Starting letter element search...');
  console.log('Using specific selector strategy to find letter elements');
  let letterElements = collectLetterElements();
  console.log(`Found ${letterElements.length} letter elements`);
  letterElements.forEach((el, idx) => {
    console.log(`Letter ${idx + 1}: "${labelOf(el)}" href=${el.getAttribute('href')}`);
  });

  if (letterElements.length < EXPECTED_LETTER_COUNT) {
    console.warn(`Expected 26 letter elements, but found only ${letterElements.length}. Will attempt to proceed anyway.`);
  }
  if (letterElements.length === 0) {
    throw new Error('Could not find any letter elements after multiple attempts. Aborting.');
  }

  const totalLetterElements = letterElements.length;
  // Labels as first seen; used to re-identify each tab after the DOM is re-queried.
  const letterLabels = letterElements.map(labelOf);
  console.log(`Found letter elements: ${letterLabels.join(', ')}`);

  // Processing order: Z, then backwards with wrap-around; plain reverse when Z is absent.
  const zIndex = letterLabels.indexOf('Z');
  if (zIndex !== -1) {
    console.log(`Found Z at index ${zIndex}`);
  } else {
    console.warn('Z not found! Falling back to reverse order');
  }
  const letterIndices = Array.from({ length: totalLetterElements }, (_, step) =>
    zIndex !== -1
      ? (zIndex - step + totalLetterElements) % totalLetterElements
      : totalLetterElements - step - 1
  );
  console.log(`Letter elements will be processed in this order: ${letterIndices.map((idx) => letterLabels[idx]).join(', ')}`);

  const firstLetter = letterLabels[letterIndices[0]];
  console.log(`First letter to process: "${firstLetter}"`);
  if (firstLetter !== 'Z') {
    console.warn(`Warning: First letter is not Z as expected!`);
  }

  let result = '<html><body>';
  let totalRowsProcessed = 0;

  for (let position = 0; position < letterIndices.length; position++) {
    const tabIndex = letterIndices[position];
    const expectedLabel = letterLabels[tabIndex];
    // Declared outside the try so the catch block can report which letter failed.
    let letter = expectedLabel;
    console.log(`\n==== Processing letter ${position + 1}/${totalLetterElements} (actual index: ${tabIndex}) ====`);

    try {
      // Re-query the tabs before every click: the previous click may have replaced the DOM.
      letterElements = collectLetterElements();
      console.log(`Reloaded ${letterElements.length} letter elements for processing index ${tabIndex + 1}`);

      // Prefer the tab at the original index; if its label no longer matches (the tab
      // bar may have changed after a click), look the tab up by label instead.
      let letterLink = letterElements[tabIndex];
      if (!letterLink || labelOf(letterLink) !== expectedLabel) {
        const sameLabel = letterElements.find((el) => labelOf(el) === expectedLabel);
        if (sameLabel) {
          letterLink = sameLabel;
        } else if (letterLink) {
          console.warn(`Tab "${expectedLabel}" not found after reload; using tab at index ${tabIndex} instead`);
        }
      }
      if (!letterLink) {
        console.error(`Index ${tabIndex + 1} is out of bounds for reloaded elements array (length: ${letterElements.length})`);
        continue;
      }
      letter = labelOf(letterLink);

      console.log(`Processing letter ${letter} (${position + 1}/${totalLetterElements})`);
      console.log(`Processing letter element:`, {
        text: letterLink.textContent,
        href: letterLink.getAttribute('href'),
        id: letterLink.id,
        classes: letterLink.className,
      });

      // Without click() there is no way to navigate; do not scrape the stale page.
      if (typeof letterLink.click !== 'function') {
        console.warn(`Letter "${letter}" element does not have click method`);
        console.error(`Failed to navigate to letter "${letter}" after multiple attempts`);
        continue;
      }

      console.log(`Clicking on letter "${letter}" link`);
      let clickSuccess = false;
      for (let attempt = 1; attempt <= MAX_CLICK_ATTEMPTS && !clickSuccess; attempt++) {
        console.log(`Click attempt ${attempt}/${MAX_CLICK_ATTEMPTS} for letter "${letter}"`);
        try {
          letterLink.click();
          console.log(`Waiting for page to load after clicking "${letter}" (attempt ${attempt})...`);
          const startWait = Date.now();
          await sleep(PAGE_LOAD_WAIT_MS);
          clickSuccess = true;
          console.log(`Waited ${Date.now() - startWait}ms after clicking "${letter}"`);
          console.log(`Proceeding with letter "${letter}" after ${attempt} attempts`);
        } catch (clickError) {
          console.error(`Error during click attempt ${attempt} for letter "${letter}":`, clickError);
          await sleep(RETRY_WAIT_MS);
        }
      }
      if (!clickSuccess) {
        console.error(`Failed to navigate to letter "${letter}" after multiple attempts`);
        continue;
      }

      // Small extra wait so the page can finish rendering.
      await sleep(RENDER_SETTLE_WAIT_MS);
      console.log(`Current URL after clicking and waiting: ${window.location.href}`);

      console.log(`Looking for table rows for letter "${letter}"`);
      const rows = findElementInAllFrames('tr', true);
      console.log(`Found ${rows.length} total rows in the document`);

      // Drop rows with at most one cell or almost no text (likely navigation/header rows).
      const contentRows = Array.from(rows).filter(
        (row) => row.cells && row.cells.length > 1 && row.textContent.trim().length > 5
      );
      console.log(`Filtered to ${contentRows.length} content rows for letter "${letter}"`);

      totalRowsProcessed += contentRows.length;
      console.log(`Letter "${letter}": Added ${contentRows.length} rows. Total rows so far: ${totalRowsProcessed}`);
      if (contentRows.length === 0) {
        console.warn(`No rows found for letter "${letter}" - will continue with next letter`);
      }

      // The "*" tab gets the id "all"; every other tab uses its lower-cased label.
      const letterId = letter === '*' ? 'all' : letter.toLowerCase();
      const heading = letter === '*' ? 'All' : `Letter ${letter}`;
      const rowsHtml = contentRows.map((row) => `${row.outerHTML}\n`).join('');
      result += `<section id="letter-${escapeHtml(letterId)}">\n`;
      result += `<h2>${escapeHtml(heading)}</h2>\n`;
      result += `<table>\n${rowsHtml}</table>\n`;
      result += '</section>\n\n';

      console.log(`Successfully processed letter "${letter}" with ${contentRows.length} entries`);
      console.log(`Current accumulated HTML size: ${result.length} characters`);
    } catch (error) {
      console.error(`Error processing letter "${letter}":`, error);
      console.error(`Stack trace:`, error.stack);
    }
  }

  result += '</body></html>';
  console.log('Finished processing all letters. Final HTML size:', result.length);
  console.log(`Total rows processed across all sections: ${totalRowsProcessed}`);
  console.log(`=== Phone book scraping process complete ===`);
  return result;
}
/**
 * Runs the whole pipeline: scrape the phone book, download the raw HTML, then convert
 * it to CSV (the converter triggers its own download).
 *
 * Errors from `scrapePhoneBook` propagate to the caller. Errors while creating the HTML
 * download or running the CSV conversion are logged and swallowed, so the scraped HTML
 * is still returned.
 *
 * @returns {Promise<string>} The scraped HTML content
 */
async function runPhoneBookScraper() {
  // Delay before the blob URL is released, so the browser has time to start the download.
  const REVOKE_DELAY_MS = 40000;

  console.log('Starting phone book scraper');
  const htmlContent = await scrapePhoneBook();

  console.log('Creating download for HTML file...');
  try {
    const blob = new Blob([htmlContent], { type: 'text/html' });
    const url = URL.createObjectURL(blob);
    console.log('Blob URL created:', url);

    const a = document.createElement('a');
    a.href = url;
    a.download = 'PhoneBook_content.html';
    console.log('Triggering download with filename:', a.download);
    a.click();
    console.log('Download triggered');
    // Release the blob URL later instead of keeping it alive for the page's lifetime.
    setTimeout(() => URL.revokeObjectURL(url), REVOKE_DELAY_MS);

    // The converter's return value is not needed here; it downloads the CSV itself.
    console.log('Starting CSV conversion process...');
    phoneBookCSVConverter(htmlContent);
    console.log('CSV conversion completed and download triggered from converter');
  } catch (downloadError) {
    console.error('Error creating download:', downloadError);
  }

  return htmlContent;
}
// Script entry point: start the scraper in the background and report how it ends.
console.log('Starting script execution');
(async () => {
  try {
    const content = await runPhoneBookScraper();
    console.log('All content processed successfully');
    console.log(`Final content size: ${content.length} characters`);
  } catch (error) {
    console.error('Fatal error in runPhoneBookScraper:', error);
    console.error('Stack trace:', error.stack);
  }
})();
// Logged right away: the scraper above keeps running asynchronously.
console.log('Script initialized');
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment