mad/plugins/ocr_captcha.sh
kittykat d62376f7a8
# 2025.02.18 - [uploadhive] Add handling of the new /cgi-bin/dl.cgi/ url tickets (WIP)
#               (unfortunately, this is tied to the requesting ip, so downloads get "Wrong IP")
# 2025.02.18 - [up_oshi] Add Manage url as comment on uploads
# 2025.02.18 - [up_oshi / oshi] use /nossl/ url and http
# 2025.02.17 - [gofile] Add a random sleep if 429 response detected (too many requests)
# 2025.02.17 - [*ALL] Audit and update all single bracket operations
# 2025.02.17 - [filehaus] Fix downloading from fh
# 2025.02.15 - [uploadbay] Update urls regex for acceptable alternate
# 2025.02.15 - [up_sendnow] Add send.now as upload host
# 2025.02.15 - [sendnow] Fix handling of filenames with special characters in url
2025-02-19 13:41:07 +00:00

184 lines
9.5 KiB
Bash

#! Name: ocr_captcha.sh
#! Author: kittykat
#! Version: 2025.02.18
#! Desc: Script to extract captcha from image using tesseract-ocr and imagemagick
#! Usage: Edit LoadPlugin="" line in mad.sh or mad.config
#! LoadPlugin="ocr_captcha.sh"
#!
#! Dependencies
#! * imagemagick: resize / enhance / convert image
#! (sudo apt-get install imagemagick)
#! * tesseract-ocr: Extract text from image
#! (sudo apt-get install tesseract-ocr)
#! - The eng_best.traineddata is included in ./plugins/ocr/tessdata/ folder (15,400,601 bytes)
#! - It can otherwise be downloaded and placed in the folder from:
#! https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
#! (SHA256: 8280AED0782FE27257A68EA10FE7EF324CA0F8D85BD2FD145D1C2B560BCB66BA)
#!
#! Notes:
#! * Return 0 (true), successfully processed image
#! * Return 1 (false), failed to process image
#!
#! Available Hook Functions:
#! -------------------------
#! * OnLoad(): Occurs after load mad.config / load plugins (prior to processing).
#! * BeginProcessing(): Occurs immediately after beginning processing of urls.txt (loops with Catnaps).
#! * PreProcessUrl(): occurs immediately after reading in an unprocessed url (^http) line to process.
#! * PostSuccessfulDownload(): occurs after a download success (is marked #OK# in the urls.txt).
#! * PostFailedDownload(): occurs after a download fails (is marked #FAIL# in the urls.txt).
#! * PostFailRetryDownload(): occurs after a download fails with a retry (is marked #RETRY# in the urls.txt).
#! * DoneProcessingAllUrls(): occurs after all the urls have finished processing (no flocks or other terms downloading).
#! * PostSuccessfulUpload(): occurs after an upload success (after upload completed ticket is created in ./downloads/).
#! * PostFailedUpload(): occurs after an upload fails definitively -- #FAIL# in the temp_upload_handler.txt
#! * PostFailRetryUpload(): occurs after an upload fails with a retry (network drop, unexpected result)
#! * DoneProcessingAllUploads(): occurs after all the files have finished processing
#!
#!
#! CaptchaOcrImage: Uses imagemagick only to alter 4 digit horizontal captchas (WIP)
#!   Arguments:
#!     $1 - path to the captcha image file
#!     $2 - data type hint; matched case-insensitively for NUMBERONLY or ALPHAONLY,
#!          anything else selects the alphanumeric pattern
#!     $3 - extra imagemagick params (stored in a global, not used by this function)
#!   Globals read:    DebugPluginsEnabled, PINK, BLUE, NC, WorkDir
#!   Globals written: DEPENDENCIES, DEPENDENCY, captcha_image_filepath, data_type,
#!                    imagemagick_extra_params, captcha
#!   Outputs: prints "[CAPTCHA_CODE:<code>]" to stdout and writes <code> to
#!            $WorkDir/.temp/ocr_final.txt
#!   Returns: 1 when a dependency or the image file is missing, 0 otherwise
CaptchaOcrImage() {
  local plugName='ocr_captcha'
  local plugFunc='CaptchaOcrImage'
  if [[ "${DebugPluginsEnabled}" == "true" ]]; then
    echo -e "[${PINK}DEBUG${NC}]: Running ${PINK}$plugFunc${NC} in ${BLUE}$plugName${NC} ...${NC}"
  fi

  # Both tools ship with imagemagick; 'compare' is the one this function actually runs.
  DEPENDENCIES=(convert compare)
  for DEPENDENCY in "${DEPENDENCIES[@]}"; do
    if ! command -v "$DEPENDENCY" >/dev/null 2>&1; then
      case "$DEPENDENCY" in
        convert | compare) echo "imagemagick not installed. Aborting" ;;
        *) echo "$DEPENDENCY not installed. Aborting" ;;
      esac
      return 1
    fi
  done

  captcha_image_filepath="$1"
  data_type="$2"
  imagemagick_extra_params="$3"
  if [[ ! -f "$captcha_image_filepath" ]]; then
    echo -e "Image not found."
    return 1
  fi

  # NOTE(review): the image handed to 'compare' as the search reference is the
  # captcha itself, base64-encoded, labelled image/webp and wrapped in literal
  # double quotes. This looks like a placeholder for a fixed digit-strip
  # reference image -- confirm the intended value before relying on the result.
  # Kept as a single 'local' statement so a base64 failure is handled exactly as
  # before (the function carries on with an empty payload).
  local digitschars='"data:image/webp;base64,'"$(base64 -w 0 "$captcha_image_filepath")"'"'

  # Pattern used to pull the match offset that follows "@ " in compare's output.
  local offset_re
  if grep -Eqi "NUMBERONLY" <<< "$data_type"; then
    offset_re='@ ([0-9]+)'
  elif grep -Eqi "ALPHAONLY" <<< "$data_type"; then
    offset_re='@ ([a-zA-Z]+)'
  else
    offset_re='@ ([0-9a-zA-Z]+)'
  fi

  # Four character slots: an 8x10 crop every 9px starting at x=22, y=8.
  local i e r
  for i in {0..3}; do
    e=$(compare -metric NCC -subimage-search "$digitschars" \
      \( "$captcha_image_filepath" -crop "8x10+$((22 + 9 * i))+8" \) null: 2>&1)
    # Each slot contributes 1 + (matched offset / 8) to the result string.
    if [[ $e =~ $offset_re ]]; then
      r+=$((1 + BASH_REMATCH[1] / 8))
    fi
  done

  # Make sure the output folder exists before writing the result file.
  mkdir -p "$WorkDir/.temp"
  printf '%s\n' "$r" > "$WorkDir/.temp/ocr_final.txt"
  captcha="$r"
  printf '[CAPTCHA_CODE:%s]\n' "$captcha"
}
#!
#! CaptchaOcrImageTesseract: Uses imagemagick to alter, and Tesseract OCR to process captchas
#!   Arguments:
#!     $1 - path to the captcha image file
#!     $2 - data type hint, matched case-insensitively: NUMBERONLY, ALPHAONLY or
#!          ALPHANUMERIC restrict the OCR character set; anything else is unrestricted
#!     $3 - extra imagemagick params, matched case-insensitively as substrings:
#!          Deskew, ContrastStretch_<B>x<W> and Brightness_<N> (supported values are
#!          listed in the loops below)
#!   Globals read:    DebugPluginsEnabled, PINK, BLUE, NC, WorkDir, ScriptDir
#!   Globals written: DEPENDENCIES, DEPENDENCY, TESSERACT_CMD, TESSDATA_PREFIX (exported),
#!                    captcha_image_filepath, data_type, imagemagick_extra_params,
#!                    IMGtemp, captcha
#!   Outputs: prints "[CAPTCHA_CODE:<code>]" to stdout
#!   Returns: 1 when a dependency or the image file is missing, 0 otherwise
CaptchaOcrImageTesseract() {
  local plugName='ocr_captcha'
  local plugFunc='CaptchaOcrImageTesseract'
  if [[ "${DebugPluginsEnabled}" == "true" ]]; then
    echo -e "[${PINK}DEBUG${NC}]: Running ${PINK}$plugFunc${NC} in ${BLUE}$plugName${NC} ...${NC}"
  fi

  DEPENDENCIES=(tesseract convert)
  for DEPENDENCY in "${DEPENDENCIES[@]}"; do
    if ! command -v "$DEPENDENCY" >/dev/null 2>&1; then
      case "$DEPENDENCY" in
        convert) echo "imagemagick not installed. Aborting" ;;
        *) echo "$DEPENDENCY not installed. Aborting" ;;
      esac
      return 1
    fi
  done
  TESSERACT_CMD=$(command -v tesseract)
  export TESSDATA_PREFIX="${ScriptDir}/plugins/ocr/tessdata"

  captcha_image_filepath="$1"
  data_type="$2"
  imagemagick_extra_params="$3"
  local captcha_image_filename="${captcha_image_filepath##*/}"
  if [[ ! -f "$captcha_image_filepath" ]]; then
    echo -e "Image not found."
    return 1
  fi

  # Working copy of the image; every later convert step rewrites it in place.
  mkdir -p "$WorkDir/.temp"
  IMGtemp="$WorkDir/.temp/$(date +%y%m%d-%H%M%S)_$captcha_image_filename.tif"

  # Base clean-up pass applied to every captcha before the optional tweaks.
  local -a base_ops=(
    -units PixelsPerInch -respect-parenthesis
    \( -compress LZW -resample 900 -density 300 -bordercolor black -border 1
    -trim +repage -fill white -draw "color 0,0 floodfill" -alpha off -shave 1x2 \)
    \( -bordercolor black -border 2 -fill white -draw "color 0,0 floodfill"
    -alpha off -shave 0x1 +repage \)
    -antialias -sharpen 0x3
  )
  convert "$captcha_image_filepath" "${base_ops[@]}" "$IMGtemp"

  # Optional deskew; the result must overwrite the working copy.
  if grep -Eqi "Deskew" <<< "$imagemagick_extra_params"; then
    convert "$IMGtemp" -deskew 10 "$IMGtemp"
  fi

  # Optional contrast stretch. The list order is the match priority; only the
  # first matching setting is applied.
  local stretch
  for stretch in 1x60 1x70 1x75 1x80 1x85 1x90 1x95 5x80 5x85 5x90 5x95 10x90 10x95; do
    if grep -Eqi "ContrastStretch_${stretch}" <<< "$imagemagick_extra_params"; then
      convert "$IMGtemp" -contrast-stretch "${stretch%x*}%x${stretch#*x}%" "$IMGtemp"
      break
    fi
  done

  # Optional brightness. Same first-match rule; the -modulate value equals the
  # number named in the parameter.
  local brightness
  for brightness in 120 130 135 140 145 150 155 160; do
    if grep -Eqi "Brightness_${brightness}" <<< "$imagemagick_extra_params"; then
      convert "$IMGtemp" -modulate "$brightness" "$IMGtemp"
      break
    fi
  done

  # Pick the OCR character whitelist from the data type hint.
  local digits='0123456789'
  local letters='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
  local mode='any' whitelist=''
  if grep -Eqi "NUMBERONLY" <<< "$data_type"; then
    mode='number'
    whitelist="$digits"
  elif grep -Eqi "ALPHAONLY" <<< "$data_type"; then
    mode='alpha'
    whitelist="$letters"
  elif grep -Eqi "ALPHANUMERIC" <<< "$data_type"; then
    mode='alnum'
    whitelist="${digits}${letters}"
  fi

  local -a tess_opts=(--psm 8 --oem 1 -l eng_best --dpi 70)
  if [[ -n "$whitelist" ]]; then
    tess_opts+=(-c "tessedit_char_whitelist=${whitelist}")
  fi

  # tr drops spaces inside the OCR text; xargs trims surrounding whitespace.
  captcha=$("$TESSERACT_CMD" "${tess_opts[@]}" "$IMGtemp" stdout | tr -d " " | xargs)

  # Drop anything outside the requested character set that still slipped through.
  case "$mode" in
    number) captcha=${captcha//[!0-9]/} ;;
    alpha) captcha=${captcha//[!a-zA-Z]/} ;;
    alnum) captcha=${captcha//[!0-9a-zA-Z]/} ;;
  esac

  rm -f "$IMGtemp"
  printf '[CAPTCHA_CODE:%s]\n' "$captcha"
}