Created
March 4, 2018 12:25
-
-
Save RidaAyed/a19fb18634e4d9b4998c55d8e8dfc85b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
##########################################
## SCRIPT VERSION 1.0.5                 ##
## AUTHOR: MARKUS (www.och-group.de)    ##
## Requires apt-get install:            ##
##   libtiff-tools                      ##
##   tesseract\*                        ##
##   libtiff-dev                        ##
##   pdftk                              ##
##   imagemagick                        ##
##########################################
# Usage: <script> [folder]
#   no argument : scan with scanimage into a fresh /tmp/scan-<random> folder
#   folder      : skip scanning and use the *.tif files already in <folder>
# The script additionally invokes scanimage, ocrmypdf and ranger.

# Timestamp used for the output file name and the PDF title.
DATETIME=$(date +%Y-%m-%d"_"%H-%M-%S)
#startdir=$(pwd)
# Directory that receives the final OCRed PDF.
startdir=/home/ra/pi
# 16 random alphanumeric characters to make the working folder name unique.
RANDOMNUMBER=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 16)
outname=$DATETIME.pdf
tmpdir=/tmp/scan-$RANDOMNUMBER

echo "####### $DATETIME #########"

# Either Scan or use prepared *.tif files in folder named in first parameter
if [[ -z "$1" ]]; then
  echo "####### TMPDIR $tmpdir ##########"
  echo "####### OUTNAME $outname ##########"
  # Abort if the working folder cannot be created or entered; otherwise the
  # rest of the script would process whatever *.tif files are in the cwd.
  mkdir -- "$tmpdir" || exit 1
  cd -- "$tmpdir" || exit 1

  echo "################## Scanning ###################"
  # Same options, same order as before; kept in an array for readability.
  scan_args=(
    --page-width 221.121 --page-height 876.695
    -l 0 -t 0 -x 221.121 -y 876.695
    --ald=yes --overscan On --prepick=On
    -b --format=tiff --mode Color --resolution 300
    --source 'ADF Duplex'
    --swcrop=yes --buffermode On --swdespeck 2 --swdeskew=yes --swskip 5%
    -d 'fujitsu:ScanSnap iX500:10443'
  )
  # NOTE(review): only stdout is captured here. If scanimage reports
  # "no SANE devices found" on stderr this check can never trigger - confirm.
  scanResult=$(scanimage "${scan_args[@]}")
  regexScan="\s+scanimage: no SANE devices found\s+"
  if [[ " $scanResult " =~ $regexScan ]]; then
    echo "!!!!!!! No scanner found !!!!!!!"
    echo "SCANRESULT: $scanResult"
    exit 1
  fi
  echo "################## Scanned ####################"
else
  # use existing folder (absolute path)
  tmpdir="$1"
  echo "####### TMPDIR $tmpdir ##########"
  echo "####### OUTNAME $outname ##########"
  # Verify the folder before trying to enter it.
  if [[ ! -d "$tmpdir" ]]; then
    echo "tmpdir $tmpdir does not exist"
    exit 1
  fi
  cd -- "$tmpdir" || exit 1
fi
optimize_color() {
  ############################################################
  # Optimize Color of image
  ############################################################
  # call:                                                    #
  #   optimize_color <filename>                              #
  # result:                                                  #
  #   <filename> (level-adjusted in place); for pages judged #
  #   not colorful a <basename>_dither.png is written too    #
  ############################################################
  local img=$1
  local page=${img%.*}
  local -r thresholdc=0.91
  local maxima testc

  # optimize Colors --> test for colors
  convert "$img" -level 20%,80%,2.0 "$img"

  ######## Other Color check variants - best is scale option, then breakup option
  # testing average colorfulness of an image in HSL (green channel is colorfulness) http://www.imagemagick.org/discourse-server/viewtopic.php?t=19580
  #testc1=`convert $1 -colorspace HSV -channel g -separate +channel -format "%[fx:mean]" info:`
  #testc2=`convert $1 -colorspace HSL -channel g -separate +channel -format "%[fx:mean]" info:`
  #echo " PAGE: ${1%.*} this pic is grey if close to 0:" $testc1 "and" $testc2
  # Two methods from here http://www.imagemagick.org/discourse-server/viewtopic.php?f=1&t=29781
  #testc3=`convert $1 -crop 50x50 -colorspace HCL -scale 1x1! -channel G -separate +channel -evaluate-sequence Max -format %[fx:mean] info: 2>/dev/null`
  #echo " PAGE: ${1%.*} breakup option says color exists for %:" $testc3

  # One convert run yields both the raw maximum of the G channel (HCL) and
  # the thresholded 0/1 flag, instead of analysing the image twice.
  read -r maxima testc < <(
    convert "$img" -colorspace HCL -scale 2% \
      -format "%[fx:maxima.g] %[fx:maxima.g+$thresholdc>1?1:0]" info:
  )
  echo " PAGE: $page scale option says color exists for %: $maxima"

  # String comparison: an empty result (convert failed) simply counts as
  # "not colorful" instead of raising a test(1) syntax error.
  if [[ "$testc" == 1 ]]; then
    echo " PAGE: $page is colorful"
    ## OPTIMIZE COLORS http://www.imagemagick.org/Usage/color_mods/
    #convert $1 -level 20%,80%,2.5 ${1%.*}"_color.tif"
    ## Alternative Color optimization (for me it does not look as good as the first)
    #convert $infile -sigmoidal-contrast 10,50% ${inname}_color_sigmoidal.tif
  else
    echo " PAGE: $page is not colorful"
    ######### DITHER IS BEST FOR COLORED IMAGES! - tx is fine for text
    #convert $1 -compress Group4 -adaptive-resize 75% -density 200 -type bilevel TIFF:- | convert - ${1%.*}"_compressed.pdf"
    ##Text Optimization: Convert to lineart
    #convert $1 -negate -separate -lat 20x20+25% -negate -evaluate-sequence add ${1%.*}"_la.png"
    ##Text Optimization: dither to black / white picture
    convert "$img" +dither -colors 2 -colorspace gray -contrast-stretch 0 "${page}_dither.png"
    #Text Optimization: lots of Magic
    #convert -respect-parenthesis \
    #\( $1 -colorspace gray -type grayscale -contrast-stretch 0 \) \
    #\( -clone 0 -colorspace gray -negate -lat 15x15+10% -contrast-stretch 0 \) \
    #-compose copy_opacity -composite -fill white -opaque none +matte -deskew 40% +repage -sharpen 0x1 \
    #$1
  fi
}
optimize_crop() {
  ############################################################
  # Crop Image
  ############################################################
  # call:                                                    #
  #   optimize_crop <filename>                               #
  # result:                                                  #
  #   <filename> (trimmed in place)                          #
  ############################################################
  ##################################### CROPPED 2 IS BETTER!
  # crop Borders if black 1
  #infile=$1
  #inname=${1%.*}
  #convert $infile +repage -scale x1! -bordercolor black -border 1 -fuzz 30% -trim ${inname}_tmp1.png
  #width=`convert ${inname}_tmp1.png -format "%w" info:`
  #offsets=`convert ${inname}_tmp1.png -format "%O" info:`
  #xoff=`echo $offsets | cut -d+ -f2`
  #convert $infile +repage -scale 1x! -bordercolor black -border 1 -fuzz 60% -trim ${inname}_tmp2.png
  #height=`convert ${inname}_tmp2.png -format "%h" info:`
  #offsets=`convert ${inname}_tmp2.png -format "%O" info:`
  #yoff=`echo $offsets | cut -d+ -f3`
  #convert $infile -crop ${width}x${height}+${xoff}+${yoff} +repage ${inname}_cropped_1.jpg

  # Crop Borders variant 2: trim with a 15% fuzz and overwrite the input.
  # The filename is quoted so paths with spaces or glob characters survive.
  convert -fuzz 15% -trim "$1" "$1"
}
correct_orientation() {
  ########################################################
  # Orientation correction (rotate if 90,180,270 degree) #
  ########################################################
  # call:                                                #
  #   correct_orientation <filename>                     #
  # result:                                              #
  #   <filename>, rotated in place for 180 and 270       #
  #   global "orientation" set to 0, 90, 180 or 270      #
  #   <basename>_tesseract.info with the tesseract log   #
  ########################################################
  local img=$1
  local page=${img%.*}
  local info="${page}_tesseract.info"
  # One to three digits: the former {3} could never match "0" or "90",
  # which made the 90 degree branch unreachable.
  local regexOrientation='[[:space:]]+Orientation in degrees: ([0-9]{1,3})[[:space:]]+'
  local file_content degrees

  # Get info from tesseract without creating a pdf file
  #tesseract -psm 0 -l eng+deu $1 result_${1%.*} 1>${1%.*}"_tesseract.info" 2>&1
  tesseract -psm 0 -l deu "$img" "result_${page}" >"$info" 2>&1
  file_content=$(<"$info")

  # Deliberately global: the caller reads it after this function returns.
  orientation=0
  if [[ " $file_content " =~ $regexOrientation ]]; then
    degrees=${BASH_REMATCH[1]}
    case "$degrees" in
      90)
        # 90 is readable from the right side
        # The image itself is left alone; only the value is recorded.
        #echo " PAGE: ${1%.*} Detected wrong orientation:" ${BASH_REMATCH[1]}
        ## Rotate picture
        #convert $1 -rotate 90 +repage $1;
        orientation=90
        ;;
      180)
        # 180 is upside down
        echo " PAGE: $page Detected wrong orientation: $degrees"
        # Rotate picture
        convert "$img" -rotate 180 "$img"
        orientation=180
        ;;
      270)
        # 270 is readable from left side
        echo " PAGE: $page Detected wrong orientation: $degrees"
        # Only 180 degrees are applied to the image here.
        convert "$img" -rotate 180 +repage "$img"
        orientation=270
        ;;
      *)
        #echo " PAGE: ${1%.*} Detected correct orientation:" ${BASH_REMATCH[1]}
        ;;
    esac
  else
    echo " PAGE: $page Cannot find any orientation"
  fi
  ############################################################
  # END Orientation correction                               #
  ############################################################
}
correct_blank_page() {
  ############################################################
  # Test if it is a blank page
  ############################################################
  # call:                                                    #
  #   correct_blank_page <filename>                          #
  # result:                                                  #
  #   <filename> or deleted file                             #
  #   global BLANKPROSPECT = true/false (grey-check outcome) #
  ############################################################
  local img=$1
  local page=${img%.*}
  local info="${page}_tesseract.info"
  # Threshold for deleting blank pages (Percentage)
  local -r threshold=0.99
  local regexCharacters='[[:space:]]+Too few characters. Skipping this page[[:space:]]+'
  local mean is_blank

  # Test percentage of lineart against threshold. A single convert run
  # returns both the mean and the thresholded 0/1 flag.
  read -r mean is_blank < <(
    convert "$img" +dither -colors 2 -colorspace gray -contrast-stretch 0 \
      -format "%[fx:mean] %[fx:mean>$threshold?1:0]" info:
  )
  echo " PAGE: $page is blank for %: $mean"

  # Check 1/2: anything but an explicit "1" (including an empty result
  # from a failed convert) means the page is kept.
  if [[ "$is_blank" != 1 ]]; then
    BLANKPROSPECT=false
    return
  fi
  echo " PAGE: $page Blank Page (1/2 - GREY-CHECK): SEEMS TO BE A BLANK PAGE------"
  BLANKPROSPECT=true

  # If file does not exist - create it - else use it
  if [[ ! -f "$info" ]]; then
    #tesseract -psm 0 -l eng+deu $1 result_${1%.*} 1>${1%.*}"_tesseract.info" 2>&1
    tesseract -psm 0 -l deu "$img" "result_${page}" >"$info" 2>&1
  fi

  # Check 2/2: tesseract found too few characters. Reaching this point
  # already implies the grey check passed, so both checks agree.
  if [[ " $(<"$info") " =~ $regexCharacters ]]; then
    echo " PAGE: $page Blank Page (2/2 - CHARACTER-CHECK): SEEMS TO BE A BLANK PAGE------"
    echo " PAGE: $page is a blank page - deleting $img------"
    rm -- "$img"
  fi
  ############################################################
}
# PIDs of the background page workers; filled by addProcess.
declare -a pids
waitProcessing() {
  ############################################################
  # Blocks until none of the PIDs in the global "pids" array
  # is alive any more
  ############################################################
  # usage:
  #   waitProcessing
  ############################################################
  # return:
  #   when all pids are processed, this one will end too;
  #   "pids" is empty afterwards
  ############################################################
  local idx
  while (( ${#pids[@]} > 0 )); do
    #echo "Waiting for pids: ${pids[@]}"
    # Iterate over the array's real indices instead of eval-ing a brace range.
    for idx in "${!pids[@]}"; do
      # kill -0 only probes whether the process can still be signalled.
      if ! kill -0 "${pids[idx]}" 2>/dev/null; then
        #echo "Done -- ${pids[$i]}"
        unset 'pids[idx]'
      fi
    done
    pids=("${pids[@]}") # Expunge nulls created by unset.
    # Only pause when something is still running.
    (( ${#pids[@]} == 0 )) || sleep 1
  done
  echo "---All Pages Done!"
}
addProcess() {
  ############################################################
  # remembers the pid of a background page worker in the
  # global "pids" array and logs it
  ############################################################
  # usage:
  #   addProcess <filename> $!
  ############################################################
  # output:
  #   " PAGE: <basename> (PID <pid>)" on stdout
  ############################################################
  local page=${1%.*}
  local pid=$2
  # Append in place; an empty pid adds nothing, as before.
  if [[ -n "$pid" ]]; then
    pids+=("$pid")
  fi
  echo " PAGE: $page (PID $pid)"
  ############################################################
}
process() {
  ############################################################
  # parallelizable Process for working on each page:
  # crop -> color check -> blank check -> rotation check -> PDF
  ############################################################
  # usage:
  #   process <filename>
  ############################################################
  # return:
  #   <basename>.pdf for a kept page; returns early without a
  #   PDF when a step removed <basename>.tif.
  #   A snapshot <basename>_<STEP>.tif is copied after each step.
  ############################################################
  local x=${1%.*}
  local tif="$x.tif"
  local pdf="$x.pdf"
  ############################################################
  echo " PAGE: $x - BEGIN"
  cp -- "$tif" "${x}_BEGIN.tif"

  echo " PAGE: $x - CROP"
  optimize_crop "$tif"
  # "return", not "continue": there is no loop inside this function.
  if [[ ! -f "$tif" ]]; then
    return 0
  fi
  cp -- "$tif" "${x}_CROPPED.tif"

  echo " PAGE: $x - COLOR CHECK"
  optimize_color "$tif"
  cp -- "$tif" "${x}_COLOR_OPTIMIZED.tif"

  echo " PAGE: $x - BLANK CHECK"
  correct_blank_page "$tif"
  # A blank page has been deleted by the check above; nothing left to do.
  if [[ ! -f "$tif" ]]; then
    return 0
  fi
  cp -- "$tif" "${x}_NOT_BLANK.tif"

  echo " PAGE: $x - ROTATION CHECK"
  # Sets the global "orientation" that is evaluated below.
  correct_orientation "$tif"
  cp -- "$tif" "${x}_CORRECT_ORIENTATION.tif"

  echo " PAGE: $x - CREATE PDF PAGE and reorientate $orientation"
  if [[ -f "$tif" ]]; then
    #tiff2pdf -o "final_$x.pdf" -z -u m -p "A4" -F $x".tif"
    ## Create PDF with fit to A4 - even in landscape mode - does not work
    #convert -compress Group4 -density 300 -define pdf:fit-page=A4 $x".tif" $x"_single.pdf"
    #convert $x".tif" -resize 595x823^> -gravity center -background white $x"_singles.pdf"
    # convert file to A4 PDF
    tiff2pdf -p a4 -z -u m -t "Scan-$DATETIME" -f -o "$pdf" "$tif"
    case "$orientation" in
      90 | 270)
        # 90: rotate pdf +90
        # 270: image was already rotated 180 - now add 90
        # The page PDF is replaced only if pdftk succeeded.
        if pdftk "$pdf" cat 1east output "${x}_o.pdf"; then
          mv -f -- "${x}_o.pdf" "$pdf"
        fi
        ;;
    esac
  fi
  #echo "---PAGE: $i -PDFTK-----"
  #echo "---PAGE: $i -FLATTEN---"
  #pdftk tiff2pdf_$x.pdf cat output pdftk_$x.pdf flatten
  #pdftk pdftk_$x.pdf dump_data > pdftk_$x.info
  #echo "---PAGE: $i -NORMALIZE-"
  #convert -normalize -density 300 -depth 8 pdftk_$x.pdf $x.png
  # echo "---PAGE: $i -TESSERACT get info-"
  # #FOR: correct_orientation(): tesseract -psm 0 -l deu+eng $x.png result_$x 1>tesseract_$x.info 2>&1
  # tesseract -psm 1 -l deu+eng $x.png result_$x pdf quiet 1>/dev/null 2>&1
  # echo "---PAGE: $i -METADATA--"
  # pdftk result_$x.pdf dump_data > pdftk_$x.info2
  # pdftk result_$x.pdf update_info pdftk_$x.info output final_$x.pdf
  # $i is the caller's global page counter.
  echo "---PAGE: $i -END------"
}
############################################################
# MAIN
############################################################
echo "################### Preprocessing ####################"
# Snapshot the *.tif list once, in version-sort order (out2 before out10),
# without word-splitting the output of ls. Names containing newlines are
# not supported.
mapfile -t scanned_pages < <(printf '%s\n' *.tif | sort -V)
i=1
for page in "${scanned_pages[@]}"; do
  # Skips the literal "*.tif" that is left over when nothing matched.
  [[ -f "$page" ]] || continue
  # Create x as number with 4 digits counting up
  printf -v x '%04d' "$i"
  cp -- "$page" "$x.tif"
  # Execute parallel worker for each scanned page
  process "$x.tif" &
  addProcess "$x.tif" $!
  # Next page
  i=$((i + 1))
done
# Wait until all pages are done
waitProcessing
echo "############ Combine all pdf to one ###########"
pdftk *.pdf cat output "$outname"
echo "############ OCR complete pdf #################"
## ocrmypdf with -l eng+deu ... then the umlauts work as well
ocrmypdf "$outname" "$startdir/$outname" -l eng+deu
cp -- "$startdir/$outname" /home/ra/temp
#paperwork-shell import $startdir/$outname
#ranger --selectfile=$startdir/$outname
ranger --selectfile="/home/ra/temp/$outname"
echo "################ Cleaning Up ##################"
cd ..
# ${tmpdir:?} aborts instead of running "rm -rf" with an empty path.
# NOTE(review): this also removes a folder that was passed as $1 - confirm
# that this is intended.
rm -rf -- "${tmpdir:?}"
cd "$startdir"
# REMINDER for BARCODE FUNCTIONALITY
#convert -density 150 "$i[0]" -quality 100 -sharpen 0x1.0 "$i.jpg" # create a JPG to search for a possible barcode
#barcode=`zbarimg -q --raw "$i.jpg"` # search for the barcode and store it in a variable
#rm "$i.jpg" # delete the image again
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment