(view source code of pages2txt.bat as plain text)
@ECHO OFF
:: Extract the text of a *.pages document: the preview.jpg stored inside the document
:: is extracted with 7-Zip, Tesseract OCR is run on it, and the recognized text is saved
:: next to the input file, using the input file's name with a .txt extension.
:: Arguments:   1 = path of the .pages file (required)
::              2 = Tesseract language code (optional, default: eng)
:: Return code: 0 if the text was extracted, 1 on any failure or when help is shown

:: Keep the helper variables set below out of the caller's environment
SETLOCAL

:: Check command line
IF "%~1"=="" GOTO Syntax
IF NOT "%~3"=="" GOTO Syntax
ECHO.%* | FIND "?" >NUL && GOTO Syntax
IF /I NOT "%~x1"==".pages" GOTO Syntax
IF NOT EXIST "%~1" (
    ECHO ←[1;31mFile not found: "%~1"←[0m
    GOTO Syntax
)

:: Resolve the input and output paths NOW, while the caller's directory is still current.
:: The ~f and ~dpn parameter modifiers are expanded relative to the current directory,
:: so after the PUSHD below a relative file name would wrongly resolve into the temp folder.
SET "InputFile=%~f1"
SET "OutputBase=%~dpn1"

:: Tesseract language code, falling back to English when none was specified
SET "Language=%~2"
IF NOT DEFINED Language SET "Language=eng"

:: Make %TEMP% the working directory
PUSHD "%TEMP%"

:: Check if files already exist
IF EXIST preview.jpg (
    ECHO ←[1;33mFile preview.jpg already exists.
    REM Flag is cleared BEFORE calling CHOICE so that SET cannot disturb the errorlevel
    SET "Confirmed="
    CHOICE.EXE /D N /T 10 /M "Do you want to delete it?←[0;30m"
    REM Only an explicit Yes (errorlevel exactly 1) counts as confirmation;
    REM No, timeout, an interrupted prompt (0) and errors (255) all abort
    IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 SET "Confirmed=1"
    IF NOT DEFINED Confirmed (
        ECHO ←[1;33mPlease move or rename preview.jpg and try again.←[0m
        POPD
        EXIT /B 1
    )
    ECHO ←[0m
    DEL preview.jpg
)
IF EXIST "%OutputBase%.txt" (
    ECHO ←[1;33mFile "%~n1.txt" already exists.
    SET "Confirmed="
    CHOICE.EXE /D N /T 10 /M "Do you want to delete it?←[0;30m"
    IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 SET "Confirmed=1"
    IF NOT DEFINED Confirmed (
        ECHO ←[1;33mPlease move or rename "%~n1.txt" and try again.←[0m
        POPD
        EXIT /B 1
    )
    ECHO ←[0m
    DEL "%OutputBase%.txt"
)

:: Extract preview.jpg from .pages file
:: Every folder in Program Files whose name starts with 7 is searched for 7z.exe
FOR /F "tokens=*" %%A IN ('DIR /AD /B "%ProgramFiles%\7*"') DO (
    FOR /F "tokens=*" %%B IN ('DIR /B /S "%ProgramFiles%\%%~A\7z.exe"') DO (
        REM If several copies of 7z.exe are found, extract only once
        IF NOT EXIST preview.jpg (
            "%%~B" e "%InputFile%" preview.jpg
        )
    )
)
IF NOT EXIST preview.jpg (
    ECHO ←[1;33mThis batch file requires 7zip, available at←[0m
    ECHO ←[1mhttps://7-zip.org/←[1;33m
    CHOICE /D N /T 10 /M "Do you want to download it?←[0;30m"
    IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
        START "" https://7-zip.org/
    )
    ECHO ←[0m
    POPD
    EXIT /B 1
)

:: Perform OCR on extracted preview.jpg and save it with same name as specified input file and .txt extension
FOR /F "tokens=*" %%A IN ('DIR /AD /B "%ProgramFiles%\tesseract*"') DO (
    REM Check if language code is specified, and if it is valid
    IF NOT "%~2"=="" (
        IF NOT EXIST "%ProgramFiles%\%%~A\tessdata\%~2.*data*" (
            ECHO ←[1;31mUnsupported Tesseract language code: "%~2"
            REM List the installed language codes, but only if more than one data file is present
            FOR /F %%B IN ('DIR /B "%ProgramFiles%\%%~A\tessdata\???.*data*" ^| FIND.EXE /C "data"') DO (
                IF %%B GTR 1 (
                    ECHO ←[0mUse one of the following language codes:
                    FOR %%C IN ("%ProgramFiles%\%%~A\tessdata\???.*data*") DO (
                        REM osd is skipped, it is not offered as a language choice
                        IF /I NOT "%%~nC"=="osd" (
                            REM SET /P with input from NUL prints text without a trailing line break
                            SET /P "=←[1;32m%%~nC←[0m, " < NUL
                        )
                    )
                    SET /P "=or omit the language code to use the default (←[1;32meng←[0m)" < NUL
                )
            )
            ECHO ←[0m
            REM Remove the temporary file, otherwise the next run stops at the "already exists" prompt
            IF EXIST preview.jpg DEL preview.jpg
            POPD
            EXIT /B 1
        )
    )
    FOR /F "tokens=*" %%B IN ('DIR /B /S "%ProgramFiles%\%%~A\tesseract.exe"') DO (
        REM The output base is passed without extension, the .txt file is expected as result
        "%%~B" preview.jpg "%OutputBase%" -l %Language%
    )
)
IF NOT EXIST "%OutputBase%.txt" (
    ECHO ←[1;33mThis batch file requires Tesseract OCR, available at←[0m
    ECHO ←[1mhttps://github.com/UB-Mannheim/tesseract/wiki←[1;33m
    CHOICE /D N /T 10 /M "Do you want to download it?←[0;30m"
    IF ERRORLEVEL 1 IF NOT ERRORLEVEL 2 (
        START "" https://github.com/UB-Mannheim/tesseract/wiki
    )
    ECHO ←[0m
    REM Remove the temporary file, otherwise the next run stops at the "already exists" prompt
    IF EXIST preview.jpg DEL preview.jpg
    POPD
    EXIT /B 1
)
ECHO ←[1;32mExtracted text successfully saved as "%OutputBase%.txt"←[0m

:: Delete temporary file
DEL preview.jpg

:: Open extracted text in Word, if available
IF EXIST "%ProgramFiles%\Microsoft Office\" (
    FOR /F "tokens=*" %%A IN ('DIR /B /S "%ProgramFiles%\Microsoft Office\winword.exe"') DO (
        START "" "%%~A" /t "%OutputBase%.txt"
    )
)

:: Restore working directory
POPD

:: Done
EXIT /B 0
:Syntax
:: Help screen: reached by GOTO Syntax when the command line check fails.
:: Prints the usage text and ends the batch file with return code 1.
:: The ANSI escape sequences (1;33m ... 0m) highlight the command line and its arguments.
ECHO.
ECHO %~nx0, Version 1.00
ECHO Use OCR to extract text from a *.pages document.
ECHO.
ECHO Usage: ←[1;33m%~nx0 file.pages [ languagecode ]←[0m
ECHO.
ECHO Where: ←[1;33mfile.pages←[0m *.pages file from which text is to be extracted
ECHO ←[1;33mlanguagecode←[0m optional Tesseract 3 letter language code (default: eng)
ECHO.
ECHO Notes: This program requires 7-zip as well as Tesseract OCR.
ECHO The extracted text will be saved as plain text in the .pages file's
ECHO parent folder, using the specified file's name, and .txt extension.
ECHO If the specified file name contains multiple dots, the output file
ECHO name will be truncated at the first dot. If the output file already
ECHO exists, you will be prompted to delete it or abort.
ECHO If an invalid language code is specified, the batch file will abort
ECHO after showing a list of available language codes.
ECHO A temporary file preview.jpg will be created. If it already exists,
ECHO you will be prompted to delete it or abort.
ECHO If MS Word is available, the extracted text will be opened in Word.
ECHO The batch file's return code ("Errorlevel") will equal 0 if the
ECHO specified file was successfully converted, otherwise it will equal 1.
ECHO.
ECHO Written by Rob van der Woude
ECHO https://www.robvanderwoude.com
:: Showing the help text always counts as a failure for the caller
EXIT /B 1
page last modified: 2024-04-16; loaded in 0.0095 seconds