Skip to content

Instantly share code, notes, and snippets.

@NeoHBz
Created February 17, 2025 14:04
Show Gist options
  • Save NeoHBz/9cfb6255390036249c7db0c44ef92443 to your computer and use it in GitHub Desktop.
Save NeoHBz/9cfb6255390036249c7db0c44ef92443 to your computer and use it in GitHub Desktop.
Search query inside Excel and PDFs
#!/usr/bin/env python3
# requirements: pandas PyPDF2 openpyxl xlrd
# usage: python search.py -s "search term" <folder_path>
import os
import argparse
import pandas as pd
import PyPDF2
import re
from openpyxl import load_workbook
import xlrd
print_not_found = False
def search_pdf(file_path, search_term):
"""Search for a term in all pages of a PDF file."""
results = []
try:
with open(file_path, 'rb') as file:
pdf_reader = PyPDF2.PdfReader(file)
for page_num, page in enumerate(pdf_reader.pages):
text = page.extract_text()
if text and search_term.lower() in text.lower():
results.append(f"Page {page_num + 1}")
return results
except Exception as e:
return [f"Error processing file: {str(e)}"]
def search_excel(file_path, search_term):
"""Search for a term in all worksheets of an Excel file."""
results = []
# Determine if this is .xlsx or .xls format
if file_path.endswith('.xlsx'):
try:
workbook = load_workbook(file_path, read_only=True, data_only=True)
for sheet_name in workbook.sheetnames:
sheet = workbook[sheet_name]
for row in sheet.iter_rows():
for cell in row:
if cell.value and search_term.lower() in str(cell.value).lower():
results.append(f"Sheet '{sheet_name}', cell {cell.coordinate}")
except Exception as e:
results.append(f"Error processing file: {str(e)}")
else: # .xls format
try:
workbook = xlrd.open_workbook(file_path)
for sheet_index in range(workbook.nsheets):
sheet = workbook.sheet_by_index(sheet_index)
sheet_name = sheet.name
for row_idx in range(sheet.nrows):
for col_idx in range(sheet.ncols):
cell_value = sheet.cell_value(row_idx, col_idx)
if cell_value and search_term.lower() in str(cell_value).lower():
cell_id = f"{chr(65 + col_idx)}{row_idx + 1}"
results.append(f"Sheet '{sheet_name}', cell {cell_id}")
except Exception as e:
results.append(f"Error processing file: {str(e)}")
return results
def analyze_folder(folder_path, search_term=None):
"""Analyze a folder containing PDF and Excel files, optionally searching for a term."""
if not os.path.isdir(folder_path):
print(f"Error: {folder_path} is not a valid directory")
return
summary = {
'pdf_files': 0,
'excel_files': 0,
'other_files': 0,
'pdf_files_list': [],
'excel_files_list': [],
'other_files_list': [],
'search_results': {},
'files_with_matches': {
'pdf': [],
'excel': []
},
'files_without_matches': {
'pdf': [],
'excel': []
}
}
for root, _, files in os.walk(folder_path):
for filename in files:
file_path = os.path.join(root, filename)
file_ext = os.path.splitext(filename)[1].lower()
if file_ext in ['.pdf']:
summary['pdf_files'] += 1
summary['pdf_files_list'].append(file_path)
if search_term:
results = search_pdf(file_path, search_term)
summary['search_results'][file_path] = results
if results:
summary['files_with_matches']['pdf'].append(file_path)
else:
summary['files_without_matches']['pdf'].append(file_path)
elif file_ext in ['.xls', '.xlsx']:
summary['excel_files'] += 1
summary['excel_files_list'].append(file_path)
if search_term:
results = search_excel(file_path, search_term)
summary['search_results'][file_path] = results
if results:
summary['files_with_matches']['excel'].append(file_path)
else:
summary['files_without_matches']['excel'].append(file_path)
else:
summary['other_files'] += 1
summary['other_files_list'].append(file_path)
return summary
def print_summary(summary, search_term):
"""Print a summary of the analysis results with separated matches information."""
print("\n--- FOLDER ANALYSIS SUMMARY ---")
print(f"PDF Files: {summary['pdf_files']}")
print(f"Excel Files: {summary['excel_files']}")
print(f"Other Files: {summary['other_files']}")
if search_term:
print("\n--- SEARCH ANALYSIS SUMMARY ---")
pdf_match_count = len(summary['files_with_matches']['pdf'])
excel_match_count = len(summary['files_with_matches']['excel'])
print(f"PDF Files with matches: {pdf_match_count}")
if pdf_match_count > 0:
for file_path in summary['files_with_matches']['pdf']:
print(f" {os.path.basename(file_path)}")
print(f"Excel Files with matches: {excel_match_count}")
if excel_match_count > 0:
for file_path in summary['files_with_matches']['excel']:
print(f" {os.path.basename(file_path)}")
print(f"Other Files with matches: 0")
if print_not_found:
print("\nNot found in:")
for file_type in ['pdf', 'excel']:
for file_path in summary['files_without_matches'][file_type]:
print(f" {os.path.basename(file_path)}")
if pdf_match_count > 0 or excel_match_count > 0:
print("\nFound in:")
for file_type in ['pdf', 'excel']:
for file_path in summary['files_with_matches'][file_type]:
results = summary['search_results'][file_path]
for result in results:
print(f" {os.path.basename(file_path)}: {result}")
print("\n")
def main():
parser = argparse.ArgumentParser(description='Analyze and search through PDF and Excel files in a folder')
parser.add_argument('folder', help='Path to the folder to analyze')
parser.add_argument('-s', '--search', help='Term to search for in the files')
args = parser.parse_args()
summary = analyze_folder(args.folder, args.search)
if summary:
print_summary(summary, args.search)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment