<#
Automate Tesseract 3.02 language data pack generation process.

@author: Quan Nguyen
@date: 16 April 2013

The script file should be placed in the same directory as Tesseract's binary executables.
All training data files must be prefixed with the language code -- such as:
vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list
-- and placed in a trainfolder directory, which could be placed directly under Tesseract directory.

http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3

Run PowerShell as Administrator and allow script execution by running the following command:

PS > Set-ExecutionPolicy RemoteSigned

Then execute the script by:

PS > .\train.ps1
or
PS > .\train.ps1 trainfolder yourlang [bootstraplang]

Positional arguments (each one is prompted for when omitted):
  1. trainfolder   - directory holding the training data files
  2. yourlang      - language code that prefixes every training data file
  3. bootstraplang - optional existing language used to bootstrap box file generation

Windows PowerShell 2.0 Download: http://support.microsoft.com/kb/968929
#>

# --- Help --------------------------------------------------------------------
if ($args[0] -and ($args[0] -eq "-?" -or $args[0] -eq "-h" -or $args[0] -eq "-help")) {
    Write-Host "Usage: .\train.ps1"
    Write-Host "   or  .\train.ps1 trainfolder yourlang [bootstraplang]"
    Write-Host "where trainfolder directory contains all the training data files prefixed with yourlang, e.g.,"
    Write-Host "vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list,"
    Write-Host "and could be placed directly under Tesseract directory"
    exit
}

# --- Collect and validate input ----------------------------------------------
$trainDir = $args[0]
if (!$trainDir) {
    $trainDir = Read-Host "Enter location of the training data folder"
}

$lang = $args[1]
if (!$lang) {
    $lang = Read-Host "Enter a language code"
}

if ($lang -eq "" -or $trainDir -eq "") {
    Write-Host "Invalid input"
    exit
}

# The training folder must be an existing directory; throw terminates the script.
if (!(Test-Path -Path $trainDir -PathType Container)) {
    throw ("{0} is not a valid path" -f $trainDir)
}

$bootstraplang = $args[2]
if (!$bootstraplang) {
    $bootstraplang = Read-Host "Enter a bootstrap language code (optional)"
}

echo "=== Generating Tesseract language data for language: $lang ==="

$fullPath = Resolve-Path $trainDir
echo "** Your training images should be in ""$fullPath"" directory."

# Paths of the per-language files that several tools below share.
$fontProperties = Join-Path $trainDir ($lang + ".font_properties")
$tmpUnicharset  = Join-Path $trainDir "unicharset"
$langUnicharset = Join-Path $trainDir ($lang + ".unicharset")
$unicharambigs  = Join-Path $trainDir ($lang + ".unicharambigs")

# --- Box files ---------------------------------------------------------------
# External tools are started with the call operator (&) and real argument
# lists rather than Invoke-Expression on a concatenated string, so paths that
# contain spaces or shell metacharacters are passed through intact.
echo "Make Box Files"
$imageBases = @()   # image paths without the .tif extension
$boxFiles = @()
foreach ($entry in dir $trainDir) {
    if ($entry.Name.ToLower().EndsWith(".tif") -and $entry.Name.StartsWith($lang)) {
        echo "** Processing image: $entry"
        $nameWoExt = [IO.Path]::Combine($trainDir, $entry.BaseName)
        $imageBases += $nameWoExt

        # Should comment out the following if/else block after done with editing
        # the box files to prevent them from getting overwritten in repeated runs.
        if (!$bootstraplang) {
            & .\tesseract ($nameWoExt + ".tif") $nameWoExt batch.nochop makebox
        } else {
            # Bootstrapping a new character set
            & .\tesseract ($nameWoExt + ".tif") $nameWoExt -l $bootstraplang batch.nochop makebox
        }

        $boxFiles += ($nameWoExt + ".box")
    }
}

# Without any image the remaining tools would only be run with empty file lists.
if ($imageBases.Count -eq 0) {
    Write-Host "No .tif training images prefixed with '$lang' were found in $fullPath"
    exit 1
}

echo "** Box files should be edited before continuing. **"

# --- .tr files ---------------------------------------------------------------
echo "Generate .tr Files"
$trFiles = @()
foreach ($base in $imageBases) {
    & .\tesseract ($base + ".tif") $base nobatch box.train
    $trFiles += ($base + ".tr")
}

# --- Character set -----------------------------------------------------------
echo "Compute the Character Set"
& .\unicharset_extractor -D $trainDir $boxFiles

# --- Clustering --------------------------------------------------------------
echo "Clustering"
& .\shapeclustering -F $fontProperties -U $tmpUnicharset $trFiles
& .\mftraining -F $fontProperties -U $tmpUnicharset -O $langUnicharset $trFiles
# inttemp and pffmtable are picked up from the current directory after mftraining.
Move-Item -Force -Path inttemp -Destination (Join-Path $trainDir ($lang + ".inttemp"))
Move-Item -Force -Path pffmtable -Destination (Join-Path $trainDir ($lang + ".pffmtable"))
#Move-Item -Force -Path Microfeat -Destination (Join-Path $trainDir ($lang + ".Microfeat"))
& .\cntraining $trFiles
# normproto and shapetable are picked up from the current directory as well.
Move-Item -Force -Path normproto -Destination (Join-Path $trainDir ($lang + ".normproto"))
Move-Item -Force -Path shapetable -Destination (Join-Path $trainDir ($lang + ".shapetable"))

# --- Dictionary data ---------------------------------------------------------
echo "Dictionary Data"
& .\wordlist2dawg (Join-Path $trainDir ($lang + ".frequent_words_list")) (Join-Path $trainDir ($lang + ".freq-dawg")) $langUnicharset
& .\wordlist2dawg (Join-Path $trainDir ($lang + ".words_list")) (Join-Path $trainDir ($lang + ".word-dawg")) $langUnicharset
#& .\wordlist2dawg (Join-Path $trainDir ($lang + ".wordlist-punc")) (Join-Path $trainDir ($lang + ".punc-dawg")) $langUnicharset
#& .\wordlist2dawg (Join-Path $trainDir ($lang + ".wordlist-number")) (Join-Path $trainDir ($lang + ".number-dawg")) $langUnicharset

# --- unicharambigs -----------------------------------------------------------
echo "The last file (unicharambigs) -- this is to be manually edited"
if (!(Test-Path $unicharambigs)) {
    # Set-Content creates the file when it does not exist yet; seed it with "v1".
    Set-Content -Path $unicharambigs -Value "v1"
}

# --- Combine -----------------------------------------------------------------
echo "Putting it all together"
# combine_tessdata takes the common "<trainfolder>\<lang>." file prefix.
& .\combine_tessdata (Join-Path $trainDir ($lang + "."))