# 2025.01.14 - [uwabaki] Add uwabaki.party as download host # 2025.01.14 - [fileblade] Additional retries and handling for blocked Tor ips (until alternative) # 2025.01.13 - [ocr_captcha] Create imagemagick OCR function for testing without tesseract # 2025.01.13 - [anonfile, dailyuploads] Update ocr call to use tesseract function # 2025.01.13 - [up_anonfile] Modify to use new upload url # 2025.01.12 - [ateasystems] Update 404 Not found response # 2025.01.11 - [mad] Update direct head response handling # 2025.01.11 - [ranoz] Add 404 Not found handling on head # 2025.01.09 - [ranoz] Add handling of "NEXT_NOT_FOUND" response # 2025.01.09 - [fileblade] Fix cdn url parsing # 2025.01.08 - [up_pixeldrain] Fix success response from pixeldrain # 2025.01.08 - [ramsgaard / up_ramsgaard] Add data.ramsgaard.me as upload / download host # 2025.01.08 - [euromussels / up_euromussels] Add uploads.euromussels.eu as upload / download host # 2025.01.07 - [up_fileland] Add fileland.io as upload host # 2025.01.07 - [up_fireget] Add fireget.com as upload host # 2025.01.06 - [uploadhive] Update the removed / gone response detection # 2025.01.06 - [fileblade] Add "user does not allow free downloads over 100MB" response (and warnings) # 2025.01.06 - [desiupload] Add desiupload as download host # 2025.01.05 - [isupload] Fix filename detection
184 lines
9.5 KiB
Bash
184 lines
9.5 KiB
Bash
#! Name: ocr_captcha.sh
|
|
#! Author: kittykat
|
|
#! Version: 2025.01.14
|
|
#! Desc: Script to extract captcha from image using tesseract-ocr and imagemagick
|
|
#! Usage: Edit LoadPlugin="" line in mad.sh or mad.config
|
|
#! LoadPlugin="ocr_captcha.sh"
|
|
#!
|
|
#! Dependencies
|
|
#! * imagemagick: resize / enhance / convert image
|
|
#! (sudo apt-get install imagemagick)
|
|
#! * tesseract-ocr: Extract text from image
|
|
#! (sudo apt-get install tesseract-ocr)
|
|
#! - The eng_best.traineddata is included in ./plugins/ocr/tessdata/ folder (15,400,601 bytes)
|
|
#! - It can otherwise be downloaded and placed in the folder from:
|
|
#! https://github.com/tesseract-ocr/tessdata_best/raw/main/eng.traineddata
|
|
#! (SHA256: 8280AED0782FE27257A68EA10FE7EF324CA0F8D85BD2FD145D1C2B560BCB66BA)
|
|
#!
|
|
#! Notes:
|
|
#! * Return 0 (true), successfully processed image
|
|
#! * Return 1 (false), failed to process image
|
|
#!
|
|
#! Available Hook Functions:
|
|
#! -------------------------
|
|
#! * OnLoad(): Occurs after load mad.config / load plugins (prior to processing).
|
|
#! * BeginProcessing(): Occurs immediately after beginning processing of urls.txt (loops with Catnaps).
|
|
#! * PreProcessUrl(): occurs immediately after reading in an unprocessed url (^http) line to process.
|
|
#! * PostSuccessfulDownload(): occurs after a download success (is marked #OK# in the urls.txt).
|
|
#! * PostFailedDownload(): occurs after a download fails (is marked #FAIL# in the urls.txt).
|
|
#! * PostFailRetryDownload(): occurs after a download fails with a retry (is marked #RETRY# in the urls.txt).
|
|
#! * DoneProcessingAllUrls(): occurs after all the urls have finished processing (no flocks or other terms downloading).
|
|
#! * PostSuccessfulUpload(): occurs after an upload success (after upload completed ticket is created in ./downloads/).
|
|
#! * PostFailedUpload(): occurs after an upload fails definitively -- #FAIL# in the temp_upload_handler.txt
|
|
#! * PostFailRetryUpload(): occurs after an upload fails with a retry (network drop, unexpected result)
|
|
#! * DoneProcessingAllUploads: occurs after all the files have finished processing
|
|
#!
|
|
#!
|
|
#! CaptchaOcrImage: Uses imagemagick only to alter 4 digit horizontal captchas (WIP)
|
|
CaptchaOcrImage() {
  # Purpose:
  #   Work-in-progress captcha reader that uses only ImageMagick. It probes four
  #   fixed 8x10 crops of the image with `compare -subimage-search` and builds
  #   the code from the reported match offsets.
  # Arguments:
  #   $1 - path to the captcha image file
  #   $2 - data type hint; matched case-insensitively as a substring:
  #        "NUMBERONLY", "ALPHAONLY", anything else = alphanumeric
  #   $3 - extra imagemagick params (stored, not used by this function)
  # Globals:
  #   reads:  DebugPluginsEnabled, PINK, BLUE, NC, WorkDir
  #   writes: DEPENDENCIES, DEPENDENCY, captcha_image_filepath, data_type,
  #           imagemagick_extra_params, captcha (left global as in the
  #           original -- NOTE(review): callers may read them; confirm before
  #           making them local)
  # Outputs:
  #   "[CAPTCHA_CODE:<code>]" on stdout; <code> is also written to
  #   "$WorkDir/.temp/ocr_final.txt".
  # Returns:
  #   1 if a dependency or the image is missing, otherwise the status of the
  #   final echo (0).
  local plugName='ocr_captcha'
  local plugFunc='CaptchaOcrImage'
  if [ "${DebugPluginsEnabled}" == "true" ]; then
    echo -e "[${PINK}DEBUG${NC}]: Running ${PINK}$plugFunc${NC} in ${BLUE}$plugName${NC} ...${NC}"
  fi

  # NOTE(review): only `convert` is probed although `compare` is the tool that
  # is actually invoked below; both ship with imagemagick.
  DEPENDENCIES=(convert)
  for DEPENDENCY in "${DEPENDENCIES[@]}"; do
    # `command -v` is a builtin and needs no quoting tricks, unlike `which`.
    if ! command -v "$DEPENDENCY" >/dev/null 2>&1; then
      if [ "$DEPENDENCY" == "convert" ]; then
        echo "imagemagick not installed. Aborting"
      else
        echo "$DEPENDENCY not installed. Aborting"
      fi
      return 1
    fi
  done

  captcha_image_filepath="$1"
  data_type="$2"
  imagemagick_extra_params="$3"

  if [ ! -f "$captcha_image_filepath" ]; then
    echo -e "Image not found."
    return 1
  fi

  # Reference image handed to `compare` as a base64 data URI.
  # NOTE(review): the value keeps the literal double quotes of the original and
  # is built from the captcha itself rather than a digit strip -- confirm this
  # is what the WIP algorithm intends.
  local digitschars
  digitschars='"data:image/webp;base64,'$(base64 -w 0 "$captcha_image_filepath")'"'

  # The three data types differ only in which characters may follow "@ " in
  # the compare output.
  local charclass
  if grep -Eqi "NUMBERONLY" <<< "$data_type"; then
    charclass='0-9'
  elif grep -Eqi "ALPHAONLY" <<< "$data_type"; then
    charclass='a-zA-Z'
  else
    charclass='0-9a-zA-Z'
  fi
  local offset_re="@ ([${charclass}]+)"

  local i e r
  for i in {0..3}; do
    # Crop the i-th 8x10 cell (x = 22 + 9*i, y = 8) and search for it.
    e=$(compare -metric NCC -subimage-search "$digitschars" \
      \( "$captcha_image_filepath" -crop "8x10+$((22+9*i))+8" \) null: 2>&1)
    # Captured offset / 8, plus one, is appended as the next character.
    [[ $e =~ $offset_re ]] && r+=$((1+BASH_REMATCH[1]/8))
  done

  # Make sure the output folder exists before writing the result file.
  mkdir -p "$WorkDir/.temp"
  echo "$r" > "$WorkDir/.temp/ocr_final.txt"
  captcha="$r"
  echo -e "[CAPTCHA_CODE:${captcha}]"
}
|
|
#!
|
|
#! CaptchaOcrImageTesseract: Uses imagemagick to alter, and Tesseract OCR to process captchas
|
|
CaptchaOcrImageTesseract() {
  # Purpose:
  #   Pre-process a captcha image with ImageMagick, then read it with
  #   Tesseract OCR.
  # Arguments:
  #   $1 - path to the captcha image file
  #   $2 - data type hint; matched case-insensitively as a substring:
  #        "NUMBERONLY", "ALPHAONLY", "ALPHANUMERIC"; anything else runs
  #        tesseract without a whitelist or post-filter
  #   $3 - extra imagemagick params; recognised substrings (case-insensitive):
  #        "Deskew", "ContrastStretch_<black>x<white>", "Brightness_<percent>"
  # Globals:
  #   reads:  DebugPluginsEnabled, PINK, BLUE, NC, ScriptDir, WorkDir
  #   writes: DEPENDENCIES, DEPENDENCY, TESSERACT_CMD, TESSDATA_PREFIX
  #           (exported), captcha_image_filepath, data_type,
  #           imagemagick_extra_params, IMGtemp, captcha (left global as in
  #           the original -- NOTE(review): callers may read them)
  # Outputs:
  #   "[CAPTCHA_CODE:<code>]" on stdout.
  # Returns:
  #   1 if a dependency or the image is missing, otherwise the status of the
  #   final echo (0).
  local plugName='ocr_captcha'
  local plugFunc='CaptchaOcrImageTesseract'
  if [ "${DebugPluginsEnabled}" == "true" ]; then
    echo -e "[${PINK}DEBUG${NC}]: Running ${PINK}$plugFunc${NC} in ${BLUE}$plugName${NC} ...${NC}"
  fi

  DEPENDENCIES=(tesseract convert)
  for DEPENDENCY in "${DEPENDENCIES[@]}"; do
    # `command -v` is a builtin and needs no quoting tricks, unlike `which`.
    if ! command -v "$DEPENDENCY" >/dev/null 2>&1; then
      if [ "$DEPENDENCY" == "convert" ]; then
        echo "imagemagick not installed. Aborting"
      else
        echo "$DEPENDENCY not installed. Aborting"
      fi
      return 1
    fi
  done

  TESSERACT_CMD=$(command -v tesseract)
  export TESSDATA_PREFIX="${ScriptDir}/plugins/ocr/tessdata"

  captcha_image_filepath="$1"
  data_type="$2"
  imagemagick_extra_params="$3"
  local captcha_image_filename="${captcha_image_filepath##*/}"

  if [ ! -f "$captcha_image_filepath" ]; then
    echo -e "Image not found."
    return 1
  fi

  mkdir -p "$WorkDir/.temp"
  IMGtemp="$WorkDir/.temp/$(date +%y%m%d-%H%M%S)_$captcha_image_filename.tif"

  # Base pass: resample, trim, flood-fill the background white, sharpen.
  convert "$captcha_image_filepath" -units PixelsPerInch -respect-parenthesis \
    \( -compress LZW -resample 900 -density 300 -bordercolor black -border 1 -trim +repage -fill white -draw "color 0,0 floodfill" -alpha off -shave 1x2 \) \
    \( -bordercolor black -border 2 -fill white -draw "color 0,0 floodfill" -alpha off -shave 0x1 +repage \) \
    -antialias -sharpen 0x3 "$IMGtemp"

  # Optional deskew. The result must overwrite the temp image; the original
  # wrote to a literal file named "IMGtemp", so the step had no effect.
  if grep -Eqi "Deskew" <<< "$imagemagick_extra_params"; then
    convert "$IMGtemp" -deskew 10 "$IMGtemp"
  fi

  # Optional contrast stretch. Only the first level matched in this fixed
  # order is applied, the same priority as the original elif chain.
  local level
  for level in 1x60 1x70 1x75 1x80 1x85 1x90 1x95 5x80 5x85 5x90 5x95 10x90 10x95; do
    if grep -Eqi "ContrastStretch_${level}" <<< "$imagemagick_extra_params"; then
      # "1x60" becomes "1%x60%".
      convert "$IMGtemp" -contrast-stretch "${level%x*}%x${level#*x}%" "$IMGtemp"
      break
    fi
  done

  # Optional brightness, first match wins. Brightness_140 now applies 140; the
  # original applied 150 for it.
  for level in 120 130 135 140 145 150 155 160; do
    if grep -Eqi "Brightness_${level}" <<< "$imagemagick_extra_params"; then
      convert "$IMGtemp" -modulate "$level" "$IMGtemp"
      break
    fi
  done

  # Pick the tesseract whitelist and the matching post-filter class.
  local whitelist='' keep=''
  if grep -Eqi "NUMBERONLY" <<< "$data_type"; then
    whitelist='0123456789'
    keep='0-9'
  elif grep -Eqi "ALPHAONLY" <<< "$data_type"; then
    whitelist='abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    keep='a-zA-Z'
  elif grep -Eqi "ALPHANUMERIC" <<< "$data_type"; then
    whitelist='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
    keep='0-9a-zA-Z'
  fi

  local -a tess_args=(--psm 8 --oem 1 -l eng_best --dpi 70)
  if [ -n "$whitelist" ]; then
    tess_args+=(-c "tessedit_char_whitelist=${whitelist}")
  fi

  # OCR to stdout; drop spaces and let xargs trim surrounding whitespace.
  captcha=$("$TESSERACT_CMD" "${tess_args[@]}" "$IMGtemp" stdout | tr -d " " | xargs)
  if [ -n "$keep" ]; then
    # Strip anything outside the requested class that slipped through.
    captcha=${captcha//[!${keep}]/}
  fi

  rm -f "$IMGtemp"
  echo -e "[CAPTCHA_CODE:${captcha}]"
}
|