Created
October 21, 2020 02:15
-
-
Save ttchengab/293fc3ca782b20cf9b05c33f13583338 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
get_info() reads the image using OpenCV and performs thresholding, dilation, noise removal, and
contouring to finally retrieve bounding boxes from the contours.
Below are some additional OpenCV functions that are available for preprocessing:
Median filter: blurs out noise by replacing each pixel with the median of its neighbouring pixels
cv2.medianBlur()
Dilation and erosion: dilation adds pixels to the boundaries of objects, erosion removes them
cv2.dilate()
cv2.erode()
cv2.morphologyEx(img, cv2.MORPH_OPEN, kernel) # Opening: an erosion followed by a dilation
""" | |
def get_info(path):
    """Extract the text of an image by running OCR on each detected text region.

    The image is binarised with a local (adaptive) threshold, small connected
    components are discarded as noise, the remaining pixels are dilated so
    neighbouring glyphs merge into blobs, and the bounding box of every blob
    is cropped from the original image and passed to Tesseract.

    Args:
        path: Path of the image file, as accepted by ``cv2.imread``.

    Returns:
        A string containing one line per region that was kept. Regions are
        ordered top-to-bottom, then left-to-right. Every recognised word is
        followed by a single space and every region by a newline.

    Raises:
        ValueError: If ``cv2.imread`` cannot load an image from ``path``.
    """
    image = cv2.imread(path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError("Could not read image from path: {!r}".format(path))

    region_texts = []
    for x, y, w, h in _find_text_regions(image):
        # Boxes narrower or shorter than 20 px are skipped before OCR.
        if w < 20 or h < 20:
            continue
        crop = cv2.cvtColor(image[y:y + h, x:x + w], cv2.COLOR_BGR2RGB)
        tsv = pytesseract.image_to_data(crop, config=r'--psm 6')
        words = _words_from_tsv(tsv)
        region_texts.append("".join(word + " " for word in words) + "\n")
    return "".join(region_texts)


def _find_text_regions(image):
    """Return (x, y, w, h) boxes of candidate text blobs, sorted by (y, x)."""
    # Threshold: pixels that are not brighter than their local Gaussian
    # neighbourhood threshold end up white (255) after the inversion.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    local_threshold = threshold_local(gray, 15, offset=6, method="gaussian")  # generic, mean, median, gaussian
    binary = (gray > local_threshold).astype("uint8") * 255
    binary = ~binary

    # Erosion followed by dilation. With a 1x1 kernel this leaves the image
    # unchanged; enlarge the kernel to actually remove specks at this stage.
    kernel = np.ones((1, 1), np.uint8)
    eroded = cv2.erode(binary, kernel, iterations=1)
    opened = cv2.dilate(eroded, kernel, iterations=1)

    # Remove noise: keep only connected components whose area is >= 10 px.
    # Row 0 of ``stats`` is the background label, hence the [1:] and the +1.
    _, labels, stats, _ = cv2.connectedComponentsWithStats(opened, None, None, None, 8, cv2.CV_32S)
    areas = stats[1:, -1]  # last column is CC_STAT_AREA
    kept_labels = np.flatnonzero(areas >= 10) + 1
    cleaned = np.where(np.isin(labels, kept_labels), 255, 0).astype(np.uint8)

    # A wide (5 rows x 15 columns) dilation merges neighbouring glyphs so that
    # one contour covers a run of text rather than a single character.
    merged = cv2.dilate(cleaned, np.ones((5, 15), np.uint8), iterations=1)
    # [-2] selects the contour list whether findContours returns two values
    # (OpenCV 2.x / 4.x) or three (OpenCV 3.x).
    contours = cv2.findContours(merged, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)[-2]

    rects = [cv2.boundingRect(contour) for contour in contours]
    return sorted(rects, key=lambda rect: (rect[1], rect[0]))


def _words_from_tsv(tsv):
    """Return the recognised words from pytesseract ``image_to_data`` TSV text."""
    words = []
    for row in tsv.split("\n")[1:]:  # the first line is the column header
        fields = row.split("\t")
        if len(fields) < 12:
            continue
        try:
            # Column 10 is ``conf``; parsed as float because some Tesseract
            # versions report fractional confidences.
            confidence = float(fields[10])
        except ValueError:
            continue
        text = fields[11].strip()  # column 11 is the recognised text
        # Rows with conf == -1 describe page/block/line structure, not words.
        if confidence < 0 or not text:
            continue
        words.append(text)
    return words
# Convert a PDF into images with pdf2image at 200 dpi.
# `your_multipage_pdf_file_here` is a placeholder for the PDF path and must be
# defined by the caller; it is not set anywhere in this snippet.
pil_images = pdf2image.convert_from_path(
    your_multipage_pdf_file_here,
    dpi=200,
    output_folder='/tmp',
    first_page=None,
    last_page=None,
    fmt='JPG',
    thread_count=1,
    userpw=None,
    use_cropbox=False,
    strict=False,
)
This is just the core of it; I'll let you figure out the rest.
Where is the image path set here? You're not assigning any image or PDF to the path, so how does the function receive the image or PDF file to be tested?
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a great solution. I have images saved in PDF files with multiple pages. Can you suggest how to apply this to PDFs that are basically scanned documents saved as PDF?