123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126 |
<#
Automate Tesseract 3.02 language data pack generation process.

@author: Quan Nguyen
@date: 16 April 2013

The script file should be placed in the same directory as Tesseract's binary executables
and must be run from that directory (the tools are invoked as .\tesseract, .\mftraining, ...).
All training data files must be prefixed with the language code -- such as:
    vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list
-- and placed in a trainfolder directory, which could be placed directly under Tesseract directory.

http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3

Run PowerShell as Administrator and allow script execution by running the following command:
    PS > Set-ExecutionPolicy RemoteSigned
Then execute the script by:
    PS > .\train.ps1
or
    PS > .\train.ps1 trainfolder yourlang [bootstraplang]

Any argument that is omitted is prompted for interactively.

Windows PowerShell 2.0 Download: http://support.microsoft.com/kb/968929
#>

# --- Help -------------------------------------------------------------------
if (@("-?", "-h", "-help") -contains $args[0]) {
    Write-Host "Usage: .\train.ps1"
    Write-Host " or .\train.ps1 trainfolder yourlang [bootstraplang]"
    Write-Host "where trainfolder directory contains all the training data files prefixed with yourlang, e.g.,"
    Write-Host "vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list,"
    Write-Host "and could be placed directly under Tesseract directory"
    exit
}

# --- Collect and validate input ---------------------------------------------
# Positional arguments: [0] training folder, [1] language code, [2] bootstrap language (optional).
$trainDir = $args[0]
if (!$trainDir) {
    $trainDir = Read-Host "Enter location of the training data folder"
}

$lang = $args[1]
if (!$lang) {
    $lang = Read-Host "Enter a language code"
}

if ([string]::IsNullOrEmpty($lang) -or [string]::IsNullOrEmpty($trainDir)) {
    Write-Host "Invalid input"
    # Non-zero exit code so calling scripts can detect the failure.
    exit 1
}

# The training folder must be an existing directory; a plain file is not acceptable.
if (!(Test-Path $trainDir -PathType Container)) {
    throw ("{0} is not a valid path" -f $trainDir)
}

$bootstraplang = $args[2]
if (!$bootstraplang) {
    $bootstraplang = Read-Host "Enter a bootstrap language code (optional)"
}

echo "=== Generating Tesseract language data for language: $lang ==="
$fullPath = Resolve-Path $trainDir
echo "** Your training images should be in ""$fullPath"" directory."

# Common prefix of every per-language file, e.g. "trainfolder\vie".
$langPrefix = [IO.Path]::Combine($trainDir, $lang)
$unicharset = [IO.Path]::Combine($trainDir, "unicharset")

# --- Make box files ---------------------------------------------------------
# NOTE: the external tools are invoked directly with the call operator (&) and
# separate arguments instead of Invoke-Expression on a concatenated string, so
# that paths containing spaces or shell metacharacters are passed through intact.
$imageBases = New-Object System.Collections.ArrayList   # image paths without the .tif extension
echo "Make Box Files"
foreach ($entry in Get-ChildItem $trainDir) {
    # Only images named "<lang>.<something>.tif" belong to this language; matching on
    # "<lang>." avoids picking up other languages that merely share the same prefix.
    if ($entry.Name.ToLower().EndsWith(".tif") -and $entry.Name.StartsWith("$lang.")) {
        echo "** Processing image: $entry"
        $nameWoExt = [IO.Path]::Combine($trainDir, $entry.BaseName)
        # ArrayList.Add returns the new index; discard it so it does not leak into the output.
        [void]$imageBases.Add($nameWoExt)

        # Should comment out the tesseract call below after done with editing the box files
        # to prevent them from getting overwritten in repeated runs.
        if (!$bootstraplang) {
            & .\tesseract "$nameWoExt.tif" $nameWoExt batch.nochop makebox
        } else {
            # Bootstrapping a new character set
            & .\tesseract "$nameWoExt.tif" $nameWoExt -l $bootstraplang batch.nochop makebox
        }
    }
}

if ($imageBases.Count -eq 0) {
    Write-Host "No training images matching ""$lang.*.tif"" were found in ""$fullPath"""
    exit 1
}

$boxFiles = @($imageBases | ForEach-Object { "$_.box" })
echo "** Box files should be edited before continuing. **"

# --- Generate .tr files -----------------------------------------------------
echo "Generate .tr Files"
$trFiles = @()
foreach ($base in $imageBases) {
    & .\tesseract "$base.tif" $base nobatch box.train
    $trFiles += "$base.tr"
}

# --- Character set and clustering -------------------------------------------
# Arrays passed to a native command are expanded into one argument per element.
echo "Compute the Character Set"
& .\unicharset_extractor -D $trainDir $boxFiles

echo "Clustering"
& .\shapeclustering -F "$langPrefix.font_properties" -U $unicharset $trFiles
& .\mftraining -F "$langPrefix.font_properties" -U $unicharset -O "$langPrefix.unicharset" $trFiles
# The training tools write their results into the current directory; move them next to
# the training data under the language prefix.
Move-Item -Force -Path inttemp -Destination "$langPrefix.inttemp"
Move-Item -Force -Path pffmtable -Destination "$langPrefix.pffmtable"
#Move-Item -Force -Path Microfeat -Destination "$langPrefix.Microfeat"
& .\cntraining $trFiles
Move-Item -Force -Path normproto -Destination "$langPrefix.normproto"
Move-Item -Force -Path shapetable -Destination "$langPrefix.shapetable"

# --- Dictionary data --------------------------------------------------------
echo "Dictionary Data"
& .\wordlist2dawg "$langPrefix.frequent_words_list" "$langPrefix.freq-dawg" "$langPrefix.unicharset"
& .\wordlist2dawg "$langPrefix.words_list" "$langPrefix.word-dawg" "$langPrefix.unicharset"
#& .\wordlist2dawg "$langPrefix.wordlist-punc" "$langPrefix.punc-dawg" "$langPrefix.unicharset"
#& .\wordlist2dawg "$langPrefix.wordlist-number" "$langPrefix.number-dawg" "$langPrefix.unicharset"

# --- unicharambigs ----------------------------------------------------------
echo "The last file (unicharambigs) -- this is to be manually edited"
if (!(Test-Path "$langPrefix.unicharambigs")) {
    # Set-Content creates the file when it does not exist yet; "v1" is the initial content.
    Set-Content -Path "$langPrefix.unicharambigs" -Value "v1"
}

# --- Combine ----------------------------------------------------------------
echo "Putting it all together"
# combine_tessdata takes the language prefix including the trailing dot, e.g. "trainfolder\vie."
& .\combine_tessdata "${langPrefix}."
|