tesseract train python_file
来源:互联网 发布:淘宝店铺提高流量 编辑:程序博客网 时间:2024/06/07 13:26
# tesseract-trainer
This is a set of two tools used to generate OCR training files for Tesseract. It is particularly designed for image files with small numbers of characters. It will help you create box files, assuming the name of the image file reflects the text contained in the image.
To run the Tesseract trainer, point it at a directory containing a set of image files and a set of box files with corresponding file names. For example, you might have a directory containing:
- asdf.png
- asdf.box
- qwerty.png
- qwerty.box
This is a set of two tools used to generate OCR training files for Tesseract. It is particularly designed for image files with small numbers of characters. It will help you create box files, assuming the name of the image file reflects the text contained in the image.
To run the Tesseract trainer, point it at a directory containing a set of image files and a set of box files with corresponding file names. For example, you might have a directory containing:
- asdf.png
- asdf.box
- qwerty.png
- qwerty.box
Here, each file name corresponds to the characters that the image contains.
import os
import subprocess

import numpy
from PIL import Image

# Steps to take before running:
#   1. Set TESSDATA_PREFIX to the correct directory.
#   2. Put the image and box files together in the same directory.
#   3. Give each image and its corresponding box file the same base filename.


class TesseractTrainer():
    """Run the Tesseract command-line training tools over one directory.

    The directory is expected to hold image files (jpg/jpeg/png) and ".box"
    files sharing the same base names. ``runAll`` executes every step in
    order; the individual steps can also be called on their own.

    All external tools are started with ``cwd=self.directory`` so the
    working directory of the calling process is never changed.
    """

    def __init__(self,
                 directory="/Users/ryan/Documents/tesseract-trainer/images",
                 languageName="eng",
                 fontName="captchaFont"):
        """Store the training configuration.

        Args:
            directory: Folder containing the image and box files.
            languageName: Language component of the generated file names.
            fontName: Font component of the generated file names; also the
                name written to the ``font_properties`` file.
        """
        self.languageName = languageName
        self.fontName = fontName
        self.directory = directory
        # Cached, space-separated file lists (each name preceded by a
        # space); None until the first successful directory scan.
        self.trainingList = None
        self.boxList = None

    def runAll(self):
        """Run every training step in the required order."""
        self.createFontFile()
        self.cleanImages()
        self.renameFiles()
        self.extractUnicode()
        self.runShapeClustering()
        self.runMfTraining()
        self.runCnTraining()
        self.createTessData()

    def cleanImages(self):
        """Threshold every jpg/jpeg/png image and save it as a ".tiff" file.

        Pixel values below 250 become 0 and all others become 255. The
        result is written next to the source image under the same base name.
        """
        print("CLEANING IMAGES...")
        for fileName in os.listdir(self.directory):
            if not fileName.endswith(("jpg", "jpeg", "png")):
                continue
            root, _ = os.path.splitext(fileName)
            # The context manager closes the source file once the
            # thresholded copy has been written.
            with Image.open(os.path.join(self.directory, fileName)) as image:
                cleaned = image.point(lambda x: 0 if x < 250 else 255)
                cleaned.save(os.path.join(self.directory, root + ".tiff"))

    def renameFiles(self):
        """Rename each box/tiff pair to "<language>.<font>.exp<N>".

        Looks for box files and uses the box file name to find the
        corresponding ".tiff" file. After renaming a pair, the training
        file for it is created via ``createTrainingFile``.

        Returns:
            The new box file names as one string, each preceded by a space.

        Raises:
            OSError: If a box file has no matching ".tiff" file.
        """
        # NOTE(review): a directory that already contains files named
        # "<language>.<font>.exp<N>.box" from an earlier run may see them
        # overwritten by the renames below -- confirm before re-running.
        boxFiles = [name for name in os.listdir(self.directory)
                    if name.endswith(".box")]
        renamedBoxes = []
        for index, fileName in enumerate(boxFiles):
            root, _ = os.path.splitext(fileName)
            prefix = "{0}.{1}.exp{2}".format(
                self.languageName, self.fontName, index)
            os.rename(os.path.join(self.directory, root + ".tiff"),
                      os.path.join(self.directory, prefix + ".tiff"))
            os.rename(os.path.join(self.directory, root + ".box"),
                      os.path.join(self.directory, prefix + ".box"))
            renamedBoxes.append(prefix + ".box")
            self.createTrainingFile(prefix)
        return "".join(" " + name for name in renamedBoxes)

    def createTrainingFile(self, prefix):
        """Create the training file for a single tiff/box pair.

        Called by ``renameFiles``. Runs ``tesseract`` in "box.train" mode;
        when its error output contains "Empty page!!", the command is run
        again with "-psm 7".

        Args:
            prefix: Shared base name of the ".tiff" and ".box" files.
        """
        print("CREATING TRAINING DATA...")
        process = subprocess.Popen(
            ["tesseract", prefix + ".tiff", prefix, "nobatch", "box.train"],
            cwd=self.directory,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE)
        errorOutput = process.communicate()[1]
        # errors="replace": undecodable bytes in the tool's output must not
        # abort the run; only the "Empty page!!" marker matters here.
        errorText = errorOutput.decode("utf-8", errors="replace")
        if "Empty page!!" in errorText:
            subprocess.call(
                ["tesseract", "-psm", "7", prefix + ".tiff", prefix,
                 "nobatch", "box.train"],
                cwd=self.directory)

    def extractUnicode(self):
        """Run ``unicharset_extractor`` over every box file.

        Raises:
            FileNotFoundError: If the directory contains no ".box" files.
        """
        print("EXTRACTING UNICODE...")
        boxList = self.getBoxFileList()
        if boxList is None:
            raise FileNotFoundError(
                "No .box files found in " + self.directory)
        command = ["unicharset_extractor"]
        command += [name for name in boxList.split(" ") if name != ""]
        subprocess.call(command, cwd=self.directory)

    def createFontFile(self):
        """Write the ``font_properties`` file for the configured font."""
        fname = os.path.join(self.directory, "font_properties")
        with open(fname, "w") as fout:
            fout.write(self.fontName + " 0 0 0 0 0")

    def _runTrainingTool(self, toolName):
        """Run ``toolName -F font_properties -U unicharset <.tr files>``.

        Raises:
            FileNotFoundError: If the directory contains no ".tr" files.
        """
        trainingList = self.getTrainingFileList()
        if trainingList is None:
            raise FileNotFoundError(
                "No .tr files found in " + self.directory)
        command = [toolName, "-F", "font_properties", "-U", "unicharset"]
        command += [name for name in trainingList.split(" ") if name != ""]
        subprocess.call(command, cwd=self.directory)

    def runShapeClustering(self):
        """Run ``shapeclustering`` over the training files."""
        print("RUNNING SHAPE CLUSTERING...")
        # shapeclustering -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        self._runTrainingTool("shapeclustering")

    def runMfTraining(self):
        """Run ``mftraining`` over the training files."""
        # mftraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        print("RUNNING MF CLUSTERING...")
        self._runTrainingTool("mftraining")

    def runCnTraining(self):
        """Run ``cntraining`` over the training files."""
        # cntraining -F font_properties -U unicharset eng.captchaFont.exp0.tr...
        print("RUNNING CN TRAINING...")
        self._runTrainingTool("cntraining")

    def createTessData(self):
        """Prefix the generated files with the language and combine them.

        Renames unicharset, shapetable, inttemp, normproto and pffmtable to
        "<language>.<name>" and then runs ``combine_tessdata <language>.``.
        """
        print("CREATING TESS DATA...")
        components = ("unicharset", "shapetable", "inttemp", "normproto",
                      "pffmtable")
        for component in components:
            os.rename(
                os.path.join(self.directory, component),
                os.path.join(self.directory,
                             self.languageName + "." + component))
        subprocess.call(["combine_tessdata", self.languageName + "."],
                        cwd=self.directory)

    def _joinedFileList(self, suffix):
        """Return the directory's files ending in ``suffix`` as one string.

        Each name is preceded by a single space. Returns None when no file
        matches.
        """
        names = [name for name in os.listdir(self.directory)
                 if name.endswith(suffix)]
        if not names:
            return None
        return "".join(" " + name for name in names)

    def getBoxFileList(self):
        """Return the cached list of ".box" files, scanning if needed.

        Returns:
            The file names as one string, each preceded by a space, or None
            when the directory holds no box files (nothing is cached then).
        """
        if self.boxList is None:
            self.boxList = self._joinedFileList(".box")
        return self.boxList

    def getTrainingFileList(self):
        """Return the cached list of ".tr" training files, scanning if needed.

        The list is cached, so the directory scan only needs to happen once.

        Returns:
            The file names as one string, each preceded by a space, or None
            when the directory holds no training files (nothing is cached
            then).
        """
        if self.trainingList is None:
            self.trainingList = self._joinedFileList(".tr")
        return self.trainingList


# Only start a training run when executed as a script, so the class can be
# imported without side effects.
if __name__ == "__main__":
    trainer = TesseractTrainer()
    trainer.runAll()
阅读全文
0 0
- tesseract train python_file
- python_file
- How to use the tools provided to train Tesseract for a new language.
- train
- train
- train
- train
- tesseract-ocr
- tesseract训练
- tesseract杂项
- Tesseract + opencv
- ocr tesseract
- Tesseract OCR
- Tesseract OCR
- Tesseract related
- tesseract 训练-
- 初识Tesseract
- Tesseract介绍
- Thread--01在线程池使用Callable和Runnable的区别以及如何关闭线程
- Linux各个目录的作用及内容
- java编程思想学习笔记(第七章:复用类)
- tp3.2中excel表格数据导入数据库
- C#读写EXCEL源码提示“office检测到此文件存在一个问题。为帮助保护您的计算机,不能打开此文件。 ”的解决
- tesseract train python_file
- 浏览器兼容
- 聊聊RESTful
- linux+cmake+opencv+运行
- android 资源uri
- SSH不能连接并提示REMOTE HOST IDENTIFICATION HAS CHANGED解决(mac)
- CSS设置输入框默认文字颜色(webkit-input-placeholder等)
- android 对话框样式
- Python下定义一个函数来显示相应的进度条