OCR with abbyyR

来源:互联网 发布:阿里云服务器怎么重装 编辑:程序博客网 时间:2024/06/04 19:39

下载package

# Install abbyyR from CRAN only when it is missing, so re-running the script
# does not reinstall the package every time.
if (!requireNamespace("abbyyR", quietly = TRUE)) {
  install.packages("abbyyR")
}
## Development version: devtools::install_github('soodoku/abbyyR')
library(abbyyR)

# Set the project directory; later steps list the images under it and write
# the OCR results below it.
# The original called rm(list = ls()) immediately after this assignment, which
# deleted project.dir again and broke every later step that reads it, so the
# workspace wipe has been removed.
project.dir <- "E:/Temp/"

抓取图片

library(rvest)
library(dplyr)

# Page whose embedded images will be downloaded.
url <- "http://www.c-gec.cn/a/zuixinhuodong/2017/0803/2531.html"

# Collect the src attribute of every img node matched by the XPath below.
imageUrl <- read_html(url) %>%
  html_nodes(xpath = "//div[@id='entrybody']/div/img/@src") %>%
  html_text()

# Make sure the destination folder exists before downloading into it.
# NOTE(review): the submission step lists files under project.dir, so the
# working directory presumably has to be project.dir here -- confirm.
raw_dir <- paste0(getwd(), "/raw/")
dir.create(raw_dir, showWarnings = FALSE, recursive = TRUE)

for (item in imageUrl) {
  # Namespaced call: library(curl) is only attached in the final download
  # step, so a bare curl_download() would not be found at this point.
  curl::curl_download(item, destfile = paste0(raw_dir, basename(item)))
}

建立App

首先要在 http://ocrsdk.com/ 注册并创建一个 App 应用,获得 Application ID 和 password。

# Register the application id and password obtained from ocrsdk.com with
# abbyyR, then query the application info.
app_credentials <- c("ROcrApp1", "63WTSkZa8OZu2fQGqNh*****")
setapp(app_credentials)
getAppInfo()

清空App空间

# Delete every task currently listed for the application.
all_tasks <- listTasks()
# seq_len() yields an empty sequence when there are no tasks; 1:nrow() would
# iterate over c(1, 0) and call deleteTask() with invalid ids.
for (i in seq_len(nrow(all_tasks))) {
  deleteTask(as.character(all_tasks$id[i]))
}

监控提交任务

library(progress)

# Build the full path of every file found (recursively) under project.dir.
# paste0(project.dir, character(0)) would return project.dir itself, so the
# empty case is handled explicitly instead of submitting the directory path.
relative_paths <- dir(project.dir, recursive = TRUE)
filename <- if (length(relative_paths) > 0) {
  paste0(project.dir, relative_paths)
} else {
  character(0)
}
if (length(filename) == 0) {
  stop("No files found under ", project.dir, call. = FALSE)
}

pb <- progress_bar$new(
  format = "  downloading [:bar] :percent\n",
  total = length(filename),
  clear = FALSE,
  width = 60
)

# One row per submitted file: its base name and the task id returned by
# submitImage(). Preallocated so there is no placeholder NA row and the data
# frame does not grow inside the loop.
tracker <- data.frame(
  filename = basename(filename),
  taskid = NA_character_,
  stringsAsFactors = FALSE
)

for (j in seq_along(filename)) {
  image_path <- filename[j]
  print(image_path)
  tracker$taskid[j] <- as.character(abbyyR::submitImage(file_path = image_path)$id)
  # Advance the progress bar
  pb$tick()
  Sys.sleep(1 / 100)
}

执行 OCR

# Start OCR processing for every tracked task, requesting xlsx output.
# seq_len() keeps the loop from running when tracker has no rows.
for (i in seq_len(nrow(tracker))) {
  processDocument(
    tracker$taskid[i],
    language = "ChinesePRC",
    profile = "documentConversion",
    exportFormat = "xlsx"
  )
}

任务状态

# Poll until every task recorded in tracker is reported as finished.
# The original compared the account-wide finished count to length(filename)
# with `==`, which never becomes TRUE (and loops forever) when the account
# holds any other finished task. Only the ids submitted by this run are
# counted here, and a timeout bounds the wait.
max_wait_secs <- 600
poll_interval_secs <- 2
started_at <- Sys.time()

repeat {
  finished <- listFinishedTasks()
  finished_count <- sum(as.character(finished$id) %in% tracker$taskid)
  if (finished_count >= nrow(tracker)) {
    print("All Done!")
    break
  }

  waited_secs <- as.numeric(difftime(Sys.time(), started_at, units = "secs"))
  if (waited_secs > max_wait_secs) {
    warning(
      "Timed out after ", max_wait_secs, " seconds: ",
      finished_count, " of ", nrow(tracker), " tasks finished.",
      call. = FALSE
    )
    break
  }
  Sys.sleep(poll_interval_secs)
}

下载文件

library(curl)

# Keep only the finished tasks whose status is "Completed" and join them to
# the tracker so each result URL is paired with its source file name.
finishedlist <- listFinishedTasks() %>%
  mutate(status = as.character(status)) %>%
  filter(status == "Completed")
results <- merge(tracker, finishedlist, by.x = "taskid", by.y = "id")

# The working directory is switched to the project directory as in the
# original script; results are written to its "res" sub-folder, which is
# created here if it does not exist yet.
setwd(project.dir)
res_dir <- paste0(getwd(), "/res/")
dir.create(res_dir, showWarnings = FALSE, recursive = TRUE)

# seq_len() skips the loop entirely when no task completed.
for (i in seq_len(nrow(results))) {
  print(i)
  # Strip only a literal trailing ".png"; the original pattern ".png" was an
  # unanchored regex in which "." matches any character.
  output_name <- paste0(sub("\\.png$", "", results$filename[i]), ".xlsx")
  curl_download(
    as.character(results$resultUrl[i]),
    destfile = paste0(res_dir, output_name)
  )
}

OCR 识别效果

原始图

这里写图片描述

识别结果

这里写图片描述