Created
February 17, 2025 14:04
-
-
Save NeoHBz/9cfb6255390036249c7db0c44ef92443 to your computer and use it in GitHub Desktop.
Search query inside Excel and PDFs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
# requirements: pandas PyPDF2 openpyxl xlrd | |
# usage: python search.py -s "search term" <folder_path> | |
import os | |
import argparse | |
import pandas as pd | |
import PyPDF2 | |
import re | |
from openpyxl import load_workbook | |
import xlrd | |
print_not_found = False | |
def search_pdf(file_path, search_term): | |
"""Search for a term in all pages of a PDF file.""" | |
results = [] | |
try: | |
with open(file_path, 'rb') as file: | |
pdf_reader = PyPDF2.PdfReader(file) | |
for page_num, page in enumerate(pdf_reader.pages): | |
text = page.extract_text() | |
if text and search_term.lower() in text.lower(): | |
results.append(f"Page {page_num + 1}") | |
return results | |
except Exception as e: | |
return [f"Error processing file: {str(e)}"] | |
def search_excel(file_path, search_term): | |
"""Search for a term in all worksheets of an Excel file.""" | |
results = [] | |
# Determine if this is .xlsx or .xls format | |
if file_path.endswith('.xlsx'): | |
try: | |
workbook = load_workbook(file_path, read_only=True, data_only=True) | |
for sheet_name in workbook.sheetnames: | |
sheet = workbook[sheet_name] | |
for row in sheet.iter_rows(): | |
for cell in row: | |
if cell.value and search_term.lower() in str(cell.value).lower(): | |
results.append(f"Sheet '{sheet_name}', cell {cell.coordinate}") | |
except Exception as e: | |
results.append(f"Error processing file: {str(e)}") | |
else: # .xls format | |
try: | |
workbook = xlrd.open_workbook(file_path) | |
for sheet_index in range(workbook.nsheets): | |
sheet = workbook.sheet_by_index(sheet_index) | |
sheet_name = sheet.name | |
for row_idx in range(sheet.nrows): | |
for col_idx in range(sheet.ncols): | |
cell_value = sheet.cell_value(row_idx, col_idx) | |
if cell_value and search_term.lower() in str(cell_value).lower(): | |
cell_id = f"{chr(65 + col_idx)}{row_idx + 1}" | |
results.append(f"Sheet '{sheet_name}', cell {cell_id}") | |
except Exception as e: | |
results.append(f"Error processing file: {str(e)}") | |
return results | |
def analyze_folder(folder_path, search_term=None): | |
"""Analyze a folder containing PDF and Excel files, optionally searching for a term.""" | |
if not os.path.isdir(folder_path): | |
print(f"Error: {folder_path} is not a valid directory") | |
return | |
summary = { | |
'pdf_files': 0, | |
'excel_files': 0, | |
'other_files': 0, | |
'pdf_files_list': [], | |
'excel_files_list': [], | |
'other_files_list': [], | |
'search_results': {}, | |
'files_with_matches': { | |
'pdf': [], | |
'excel': [] | |
}, | |
'files_without_matches': { | |
'pdf': [], | |
'excel': [] | |
} | |
} | |
for root, _, files in os.walk(folder_path): | |
for filename in files: | |
file_path = os.path.join(root, filename) | |
file_ext = os.path.splitext(filename)[1].lower() | |
if file_ext in ['.pdf']: | |
summary['pdf_files'] += 1 | |
summary['pdf_files_list'].append(file_path) | |
if search_term: | |
results = search_pdf(file_path, search_term) | |
summary['search_results'][file_path] = results | |
if results: | |
summary['files_with_matches']['pdf'].append(file_path) | |
else: | |
summary['files_without_matches']['pdf'].append(file_path) | |
elif file_ext in ['.xls', '.xlsx']: | |
summary['excel_files'] += 1 | |
summary['excel_files_list'].append(file_path) | |
if search_term: | |
results = search_excel(file_path, search_term) | |
summary['search_results'][file_path] = results | |
if results: | |
summary['files_with_matches']['excel'].append(file_path) | |
else: | |
summary['files_without_matches']['excel'].append(file_path) | |
else: | |
summary['other_files'] += 1 | |
summary['other_files_list'].append(file_path) | |
return summary | |
def print_summary(summary, search_term): | |
"""Print a summary of the analysis results with separated matches information.""" | |
print("\n--- FOLDER ANALYSIS SUMMARY ---") | |
print(f"PDF Files: {summary['pdf_files']}") | |
print(f"Excel Files: {summary['excel_files']}") | |
print(f"Other Files: {summary['other_files']}") | |
if search_term: | |
print("\n--- SEARCH ANALYSIS SUMMARY ---") | |
pdf_match_count = len(summary['files_with_matches']['pdf']) | |
excel_match_count = len(summary['files_with_matches']['excel']) | |
print(f"PDF Files with matches: {pdf_match_count}") | |
if pdf_match_count > 0: | |
for file_path in summary['files_with_matches']['pdf']: | |
print(f" {os.path.basename(file_path)}") | |
print(f"Excel Files with matches: {excel_match_count}") | |
if excel_match_count > 0: | |
for file_path in summary['files_with_matches']['excel']: | |
print(f" {os.path.basename(file_path)}") | |
print(f"Other Files with matches: 0") | |
if print_not_found: | |
print("\nNot found in:") | |
for file_type in ['pdf', 'excel']: | |
for file_path in summary['files_without_matches'][file_type]: | |
print(f" {os.path.basename(file_path)}") | |
if pdf_match_count > 0 or excel_match_count > 0: | |
print("\nFound in:") | |
for file_type in ['pdf', 'excel']: | |
for file_path in summary['files_with_matches'][file_type]: | |
results = summary['search_results'][file_path] | |
for result in results: | |
print(f" {os.path.basename(file_path)}: {result}") | |
print("\n") | |
def main(): | |
parser = argparse.ArgumentParser(description='Analyze and search through PDF and Excel files in a folder') | |
parser.add_argument('folder', help='Path to the folder to analyze') | |
parser.add_argument('-s', '--search', help='Term to search for in the files') | |
args = parser.parse_args() | |
summary = analyze_folder(args.folder, args.search) | |
if summary: | |
print_summary(summary, args.search) | |
if __name__ == "__main__": | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment