NeoHBz · February 17, 2025 14:04
diff --git a/search.py b/search.py
 #!/usr/bin/env python3

 # requirements: pandas PyPDF2 openpyxl xlrd
 # usage: python search.py -s "search term" <folder_path>

 import os
 import argparse
 import pandas as pd
 import PyPDF2
 import re
 from openpyxl import load_workbook
 import xlrd

 print_not_found = False

 def search_pdf(file_path, search_term):
    """Search for a term in all pages of a PDF file."""
    results = []
    try:
        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages):
                text = page.extract_text()
                if text and search_term.lower() in text.lower():
                    results.append(f"Page {page_num + 1}")
        return results
    except Exception as e:
        return [f"Error processing file: {str(e)}"]

 def search_excel(file_path, search_term):
    """Search for a term in all worksheets of an Excel file."""
    results = []
    
    # Determine if this is .xlsx or .xls format
    if file_path.endswith('.xlsx'):
        try:
            workbook = load_workbook(file_path, read_only=True, data_only=True)
            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                for row in sheet.iter_rows():
                    for cell in row:
                        if cell.value and search_term.lower() in str(cell.value).lower():
                            results.append(f"Sheet '{sheet_name}', cell {cell.coordinate}")
        except Exception as e:
            results.append(f"Error processing file: {str(e)}")
    else:  # .xls format
        try:
            workbook = xlrd.open_workbook(file_path)
            for sheet_index in range(workbook.nsheets):
                sheet = workbook.sheet_by_index(sheet_index)
                sheet_name = sheet.name
                for row_idx in range(sheet.nrows):
                    for col_idx in range(sheet.ncols):
                        cell_value = sheet.cell_value(row_idx, col_idx)
                        if cell_value and search_term.lower() in str(cell_value).lower():
                            cell_id = f"{chr(65 + col_idx)}{row_idx + 1}"
                            results.append(f"Sheet '{sheet_name}', cell {cell_id}")
        except Exception as e:
            results.append(f"Error processing file: {str(e)}")
    
    return results

 def analyze_folder(folder_path, search_term=None):
    """Analyze a folder containing PDF and Excel files, optionally searching for a term."""
    if not os.path.isdir(folder_path):
        print(f"Error: {folder_path} is not a valid directory")
        return
    
    summary = {
        'pdf_files': 0,
        'excel_files': 0,
        'other_files': 0,
        'pdf_files_list': [],
        'excel_files_list': [],
        'other_files_list': [],
        'search_results': {},
        'files_with_matches': {
            'pdf': [],
            'excel': []
        },
        'files_without_matches': {
            'pdf': [],
            'excel': []
        }
    }
    
    for root, _, files in os.walk(folder_path):
        for filename in files:
            file_path = os.path.join(root, filename)
            file_ext = os.path.splitext(filename)[1].lower()
            
            if file_ext in ['.pdf']:
                summary['pdf_files'] += 1
                summary['pdf_files_list'].append(file_path)
                if search_term:
                    results = search_pdf(file_path, search_term)
                    summary['search_results'][file_path] = results
                    if results:
                        summary['files_with_matches']['pdf'].append(file_path)
                    else:
                        summary['files_without_matches']['pdf'].append(file_path)
            
            elif file_ext in ['.xls', '.xlsx']:
                summary['excel_files'] += 1
                summary['excel_files_list'].append(file_path)
                if search_term:
                    results = search_excel(file_path, search_term)
                    summary['search_results'][file_path] = results
                    if results:
                        summary['files_with_matches']['excel'].append(file_path)
                    else:
                        summary['files_without_matches']['excel'].append(file_path)
            
            else:
                summary['other_files'] += 1
                summary['other_files_list'].append(file_path)
    
    return summary

 def print_summary(summary, search_term):
    """Print a summary of the analysis results with separated matches information."""
    print("\n--- FOLDER ANALYSIS SUMMARY ---")
    print(f"PDF Files: {summary['pdf_files']}")
    print(f"Excel Files: {summary['excel_files']}")
    print(f"Other Files: {summary['other_files']}")
    
    if search_term:
        print("\n--- SEARCH ANALYSIS SUMMARY ---")
        pdf_match_count = len(summary['files_with_matches']['pdf'])
        excel_match_count = len(summary['files_with_matches']['excel'])
        
        print(f"PDF Files with matches: {pdf_match_count}")
        if pdf_match_count > 0:
            for file_path in summary['files_with_matches']['pdf']:
                print(f"  {os.path.basename(file_path)}")
        
        print(f"Excel Files with matches: {excel_match_count}")
        if excel_match_count > 0:
            for file_path in summary['files_with_matches']['excel']:
                print(f"  {os.path.basename(file_path)}")
        
        print(f"Other Files with matches: 0")
        
        if print_not_found:
            print("\nNot found in:")
            for file_type in ['pdf', 'excel']:
                for file_path in summary['files_without_matches'][file_type]:
                    print(f"  {os.path.basename(file_path)}")
        
        if pdf_match_count > 0 or excel_match_count > 0:
            print("\nFound in:")
            for file_type in ['pdf', 'excel']:
                for file_path in summary['files_with_matches'][file_type]:
                    results = summary['search_results'][file_path]
                    for result in results:
                        print(f"  {os.path.basename(file_path)}: {result}")
        print("\n")

 def main():
    parser = argparse.ArgumentParser(description='Analyze and search through PDF and Excel files in a folder')
    parser.add_argument('folder', help='Path to the folder to analyze')
    parser.add_argument('-s', '--search', help='Term to search for in the files')
    
    args = parser.parse_args()
    
    summary = analyze_folder(args.folder, args.search)
    if summary:
        print_summary(summary, args.search)

 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# requirements: pandas PyPDF2 openpyxl xlrd
	# usage: python search.py -s "search term" <folder_path>

	import os
	import argparse
	import pandas as pd
	import PyPDF2
	import re
	from openpyxl import load_workbook
	import xlrd

	print_not_found = False

	def search_pdf(file_path, search_term):
	"""Search for a term in all pages of a PDF file."""
	results = []
	try:
	with open(file_path, 'rb') as file:
	pdf_reader = PyPDF2.PdfReader(file)
	for page_num, page in enumerate(pdf_reader.pages):
	text = page.extract_text()
	if text and search_term.lower() in text.lower():
	results.append(f"Page {page_num + 1}")
	return results
	except Exception as e:
	return [f"Error processing file: {str(e)}"]

	def search_excel(file_path, search_term):
	"""Search for a term in all worksheets of an Excel file."""
	results = []

	# Determine if this is .xlsx or .xls format
	if file_path.endswith('.xlsx'):
	try:
	workbook = load_workbook(file_path, read_only=True, data_only=True)
	for sheet_name in workbook.sheetnames:
	sheet = workbook[sheet_name]
	for row in sheet.iter_rows():
	for cell in row:
	if cell.value and search_term.lower() in str(cell.value).lower():
	results.append(f"Sheet '{sheet_name}', cell {cell.coordinate}")
	except Exception as e:
	results.append(f"Error processing file: {str(e)}")
	else: # .xls format
	try:
	workbook = xlrd.open_workbook(file_path)
	for sheet_index in range(workbook.nsheets):
	sheet = workbook.sheet_by_index(sheet_index)
	sheet_name = sheet.name
	for row_idx in range(sheet.nrows):
	for col_idx in range(sheet.ncols):
	cell_value = sheet.cell_value(row_idx, col_idx)
	if cell_value and search_term.lower() in str(cell_value).lower():
	cell_id = f"{chr(65 + col_idx)}{row_idx + 1}"
	results.append(f"Sheet '{sheet_name}', cell {cell_id}")
	except Exception as e:
	results.append(f"Error processing file: {str(e)}")

	return results

	def analyze_folder(folder_path, search_term=None):
	"""Analyze a folder containing PDF and Excel files, optionally searching for a term."""
	if not os.path.isdir(folder_path):
	print(f"Error: {folder_path} is not a valid directory")
	return

	summary = {
	'pdf_files': 0,
	'excel_files': 0,
	'other_files': 0,
	'pdf_files_list': [],
	'excel_files_list': [],
	'other_files_list': [],
	'search_results': {},
	'files_with_matches': {
	'pdf': [],
	'excel': []
	},
	'files_without_matches': {
	'pdf': [],
	'excel': []
	}
	}

	for root, _, files in os.walk(folder_path):
	for filename in files:
	file_path = os.path.join(root, filename)
	file_ext = os.path.splitext(filename)[1].lower()

	if file_ext in ['.pdf']:
	summary['pdf_files'] += 1
	summary['pdf_files_list'].append(file_path)
	if search_term:
	results = search_pdf(file_path, search_term)
	summary['search_results'][file_path] = results
	if results:
	summary['files_with_matches']['pdf'].append(file_path)
	else:
	summary['files_without_matches']['pdf'].append(file_path)

	elif file_ext in ['.xls', '.xlsx']:
	summary['excel_files'] += 1
	summary['excel_files_list'].append(file_path)
	if search_term:
	results = search_excel(file_path, search_term)
	summary['search_results'][file_path] = results
	if results:
	summary['files_with_matches']['excel'].append(file_path)
	else:
	summary['files_without_matches']['excel'].append(file_path)

	else:
	summary['other_files'] += 1
	summary['other_files_list'].append(file_path)

	return summary

	def print_summary(summary, search_term):
	"""Print a summary of the analysis results with separated matches information."""
	print("\n--- FOLDER ANALYSIS SUMMARY ---")
	print(f"PDF Files: {summary['pdf_files']}")
	print(f"Excel Files: {summary['excel_files']}")
	print(f"Other Files: {summary['other_files']}")

	if search_term:
	print("\n--- SEARCH ANALYSIS SUMMARY ---")
	pdf_match_count = len(summary['files_with_matches']['pdf'])
	excel_match_count = len(summary['files_with_matches']['excel'])

	print(f"PDF Files with matches: {pdf_match_count}")
	if pdf_match_count > 0:
	for file_path in summary['files_with_matches']['pdf']:
	print(f" {os.path.basename(file_path)}")

	print(f"Excel Files with matches: {excel_match_count}")
	if excel_match_count > 0:
	for file_path in summary['files_with_matches']['excel']:
	print(f" {os.path.basename(file_path)}")

	print(f"Other Files with matches: 0")

	if print_not_found:
	print("\nNot found in:")
	for file_type in ['pdf', 'excel']:
	for file_path in summary['files_without_matches'][file_type]:
	print(f" {os.path.basename(file_path)}")

	if pdf_match_count > 0 or excel_match_count > 0:
	print("\nFound in:")
	for file_type in ['pdf', 'excel']:
	for file_path in summary['files_with_matches'][file_type]:
	results = summary['search_results'][file_path]
	for result in results:
	print(f" {os.path.basename(file_path)}: {result}")
	print("\n")

	def main():
	parser = argparse.ArgumentParser(description='Analyze and search through PDF and Excel files in a folder')
	parser.add_argument('folder', help='Path to the folder to analyze')
	parser.add_argument('-s', '--search', help='Term to search for in the files')

	args = parser.parse_args()

	summary = analyze_folder(args.folder, args.search)
	if summary:
	print_summary(summary, args.search)

	if __name__ == "__main__":
	main()