train.ps1 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126
  1. <#
  2. Automate Tesseract 3.02 language data pack generation process.
  3. @author: Quan Nguyen
  4. @date: 16 April 2013
  5. The script file should be placed in the same directory as Tesseract's binary executables.
  6. All training data files must be prefixed with the language code -- such as:
  7. vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list
  8. -- and placed in a trainfolder directory, which could be placed directly under Tesseract directory.
  9. http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3
  10. Run PowerShell as Administrator and allow script execution by running the following command:
  11. PS > Set-ExecutionPolicy RemoteSigned
  12. Then execute the script by:
  13. PS > .\train.ps1
  14. or
  15. PS > .\train.ps1 trainfolder yourlang [bootstraplang]
  16. Windows PowerShell 2.0 Download: http://support.microsoft.com/kb/968929
  17. #>
  # Print the usage text and stop when the first argument is a help switch.
  $helpSwitches = @("-?", "-h", "-help")
  if ($helpSwitches -contains $args[0]) {
      $usageLines = @(
          "Usage: .\train.ps1",
          " or .\train.ps1 trainfolder yourlang [bootstraplang]",
          "where trainfolder directory contains all the training data files prefixed with yourlang, e.g.,",
          "vie.arial.exp0.tif, vie.font_properties, vie.unicharambigs, vie.frequent_words_list, vie.words_list,",
          "and could be placed directly under Tesseract directory"
      )
      foreach ($usageLine in $usageLines) {
          Write-Host $usageLine
      }
      exit
  }
  # Training data folder: first positional argument, otherwise ask for it.
  $trainDir = $args[0]
  if (!$trainDir) {
      $trainDir = Read-Host "Enter location of the training data folder"
  }

  # Language code: second positional argument, otherwise ask for it.
  $lang = $args[1]
  if (!$lang) {
      $lang = Read-Host "Enter a language code"
  }

  # Reject a missing/empty folder or language code and report failure to the caller.
  if (!$lang -or !$trainDir) {
      Write-Host "Invalid input"
      exit 1
  }

  # The folder's contents are listed further down, so it has to be an existing
  # directory, not merely an existing path. throw terminates the script on its own.
  if (!(Test-Path -Path $trainDir -PathType Container)) {
      throw ("{0} is not a valid path" -f $trainDir)
  }

  # Optional bootstrap language: third positional argument, otherwise ask for it.
  # An empty answer means "no bootstrap language".
  $bootstraplang = $args[2]
  if (!$bootstraplang) {
      $bootstraplang = Read-Host "Enter a bootstrap language code (optional)"
  }
  echo "=== Generating Tesseract language data for language: $lang ==="
  $fullPath = Resolve-Path $trainDir
  echo "** Your training images should be in ""$fullPath"" directory."

  # Base path (training folder + file name without extension) of every training
  # image that gets processed; the .tr generation loop iterates over this list.
  $al = New-Object System.Collections.ArrayList

  echo "Make Box Files"
  # Space-separated list of the generated .box files, kept as a string because
  # the character-set step consumes it in that form.
  $boxFiles = ""
  foreach ($entry in dir $trainDir) {
      # Only .tif images (extension compared case-insensitively) whose name starts with the language code.
      if ($entry.Name.ToLower().EndsWith(".tif") -and $entry.Name.StartsWith($lang)) {
          echo "** Processing image: $entry"
          $nameWoExt = [IO.Path]::Combine($trainDir, $entry.BaseName)
          # ArrayList.Add returns the index of the new item; discard it so it
          # does not end up in the script's output.
          [void]$al.Add($nameWoExt)

          # Build the argument list as an array and run tesseract with the call
          # operator, so paths containing spaces or special characters are passed
          # as single arguments instead of being re-parsed by Invoke-Expression.
          $makeBoxArgs = @("$nameWoExt.tif", $nameWoExt)
          if ($bootstraplang) {
              #Bootstrapping a new character set
              $makeBoxArgs += @("-l", $bootstraplang)
          }
          $makeBoxArgs += @("batch.nochop", "makebox")

          #Should comment out the next line after done with editing the box files to prevent them from getting overwritten in repeated runs.
          & .\tesseract $makeBoxArgs
          $boxFiles += $nameWoExt + ".box "
      }
  }
  echo "** Box files should be edited before continuing. **"
  echo "Generate .tr Files"
  # Space-separated list of the generated .tr files, kept as a string because
  # the clustering steps consume it in that form.
  $trFiles = ""
  foreach ($entry in $al) {
      # $entry is an image base path without extension. Run tesseract with the
      # call operator so the path is passed as one argument and is not re-parsed
      # as PowerShell code by Invoke-Expression.
      & .\tesseract "$entry.tif" $entry nobatch box.train
      $trFiles += $entry + ".tr "
  }
  # Build the .box / .tr file lists from $al (the image base paths) as arrays, so
  # every file is handed to the tools as its own argument even when a path
  # contains spaces, and nothing is re-parsed by Invoke-Expression.
  $boxList = @($al | ForEach-Object { "$_.box" })
  $trList = @($al | ForEach-Object { "$_.tr" })
  $fontProps = "$trainDir\$lang.font_properties"
  $rawUnicharset = "$trainDir\unicharset"

  echo "Compute the Character Set"
  & .\unicharset_extractor -D $trainDir $boxList

  echo "Clustering"
  & .\shapeclustering -F $fontProps -U $rawUnicharset $trList
  & .\mftraining -F $fontProps -U $rawUnicharset -O "$trainDir\$lang.unicharset" $trList
  # Move the files named inttemp / pffmtable from the current directory into the
  # training folder, prefixed with the language code.
  Move-Item -Force -Path inttemp -Destination "$trainDir\$lang.inttemp"
  Move-Item -Force -Path pffmtable -Destination "$trainDir\$lang.pffmtable"
  #Move-Item -Force -Path Microfeat -Destination "$trainDir\$lang.Microfeat"

  & .\cntraining $trList
  Move-Item -Force -Path normproto -Destination "$trainDir\$lang.normproto"
  Move-Item -Force -Path shapetable -Destination "$trainDir\$lang.shapetable"
  echo "Dictionary Data"
  # Convert the word lists into DAWG files. The tools are run with the call
  # operator so each path is passed as a single argument rather than being
  # re-parsed by Invoke-Expression.
  $langUnicharset = "$trainDir\$lang.unicharset"
  & .\wordlist2dawg "$trainDir\$lang.frequent_words_list" "$trainDir\$lang.freq-dawg" $langUnicharset
  & .\wordlist2dawg "$trainDir\$lang.words_list" "$trainDir\$lang.word-dawg" $langUnicharset
  #& .\wordlist2dawg "$trainDir\$lang.wordlist-punc" "$trainDir\$lang.punc-dawg" $langUnicharset
  #& .\wordlist2dawg "$trainDir\$lang.wordlist-number" "$trainDir\$lang.number-dawg" $langUnicharset
  echo "The last file (unicharambigs) -- this is to be manually edited"
  $ambigsFile = "$trainDir\$lang.unicharambigs"
  if (!(Test-Path $ambigsFile)) {
      # Set-Content creates the file when it is missing, so no separate New-Item
      # is needed (New-Item also wrote the new file object to the script output).
      Set-Content -Path $ambigsFile -Value "v1"
  }

  echo "Putting it all together"
  # combine_tessdata receives the common file prefix "<trainDir>\<lang>." (trailing
  # dot included); the call operator passes it as one argument.
  & .\combine_tessdata "$trainDir\$lang."