说干就干,我满怀期待地用 github.com/ledongthuc/pdf 包读取了一下Octopus和Humphrey的PDF报告,结果什么都没有读取到。查询了一下才知道:github.com/ledongthuc/pdf 包难以从结构复杂的PDF中提取文本信息,而由Java语言写成的jar包Tika则擅长解析结构复杂的PDF。于是我尝试用Tika来实现我们伟大的目标:自动化提取Octopus和Humphrey PDF报告中的信息,从而解放双手。幸运的是,这次尝试成功了。
工欲善其事必先利其器,我们需要使用的工具如下:
配置好以上环境之后,就已经成功一大半了,接下来只需要复制、粘贴,而后运行即可。
程序执行流程如下:
- 开启tika本地服务器
- 读取Names和IDs
- 将PDF解析为HTML,再从生成的HTML中提取所需数据
- 保存数据
源码如下:
package main
import (
"bufio"
"io"
"io/ioutil"
"os/exec"
"context"
"fmt"
"encoding/csv"
"github.com/google/go-tika/tika"
"github.com/PuerkitoBio/goquery"
"strings"
"regexp"
"log"
"os"
"time"
)
// Patterns applied by HtmlParser to the text of each <div> in the parsed
// report. Each one matches from its label up to the end of that line ('.' does
// not match a newline).
// NOTE(review): MS / MD / sLV are presumably the Octopus global indices — confirm.
var (
// reg0 matches the first occurrence of "MS" and the rest of its line.
reg0 = regexp.MustCompile(`MS.+`)
// reg1 matches the first occurrence of "MD" and the rest of its line.
reg1 = regexp.MustCompile(`MD.+`)
// reg2 matches the first occurrence of "sLV" and the rest of its line.
reg2 = regexp.MustCompile(`sLV.+`)
)
// Eye is the pair of CSV cells written for one eye.
type Eye struct {
	// whichEye is the label cell; this file fills it with "右", "左", a header
	// title, or the empty string.
	whichEye string
	// result is the text extracted from the report for that eye.
	result string
}
// ReadFile reads fileName line by line and returns every line with its
// surrounding whitespace trimmed.
//
// Blank lines are kept as empty strings so that line i of one file stays
// aligned with line i of another (main pairs Names.txt with IDs.txt by index).
// Whatever follows the last '\n' is appended as well, so a file that ends with
// a newline yields a trailing "" element.
//
// If the file cannot be opened the error is printed and nil is returned. If a
// read fails midway the error is printed and the lines collected so far are
// returned.
func ReadFile(fileName string) (res []string) {
	// Read-only access is all that is needed; opening with O_RDWR made the
	// call fail on write-protected files for no benefit.
	file, err := os.Open(fileName)
	if err != nil {
		fmt.Println("Open file error!", err)
		return nil
	}
	defer file.Close()

	// The size is purely informational, so a Stat failure must not abort the run.
	if stat, err := file.Stat(); err != nil {
		fmt.Println("Stat file error!", err)
	} else {
		fmt.Println("file size=", stat.Size())
	}

	buf := bufio.NewReader(file)
	for {
		line, err := buf.ReadString('\n')
		// ReadString returns the data read so far even on error, so the
		// final unterminated line is recorded before the error is examined.
		res = append(res, strings.TrimSpace(line))
		if err == io.EOF {
			fmt.Println("File read ok!")
			return res
		}
		if err != nil {
			fmt.Println("Read file error!", err)
			return res
		}
	}
}
// PathExists reports whether path can be stat'ed.
//
// It returns (true, nil) when os.Stat succeeds, (false, nil) when the error
// says the path does not exist, and (false, err) for any other Stat error.
func PathExists(path string) (bool, error) {
	_, statErr := os.Stat(path)
	switch {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		return false, nil
	default:
		return false, statErr
	}
}
// ScanFiles returns the paths of the regular (non-directory) entries directly
// inside fileDir, in the filename order produced by ioutil.ReadDir.
//
// Each path is fileDir followed by the entry name; a path separator is inserted
// when fileDir does not already end with one. A missing directory yields nil
// silently; any other error while reading the directory is printed and also
// yields nil.
func ScanFiles(fileDir string) []string {
	entries, err := ioutil.ReadDir(fileDir)
	if err != nil {
		// A directory that simply is not there is an expected case for the
		// caller; everything else is worth reporting instead of being dropped.
		if !os.IsNotExist(err) {
			fmt.Printf("get dir error![%v]\n", err)
		}
		return nil
	}

	// Plain concatenation only works when the caller remembered the trailing
	// separator; add it when it is missing so the joined path is always valid.
	prefix := fileDir
	if n := len(prefix); n > 0 && !os.IsPathSeparator(prefix[n-1]) {
		prefix += string(os.PathSeparator)
	}

	var fileNameList []string
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		fileNameList = append(fileNameList, prefix+entry.Name())
	}
	return fileNameList
}
// SaveFile appends one CSV record to ./OctopusData.csv, creating the file if it
// does not exist yet. The record is: Name, ID, then the label and result of OD
// followed by the label and result of OS. Records are comma separated and
// terminated with CRLF.
//
// Any failure to open, write, or close the file ends the program via
// log.Fatalf.
func SaveFile(Name string, ID string, OD Eye, OS Eye) {
	// O_APPEND sends every write to the end of the file, which replaces the
	// earlier manual Seek whose error was never checked.
	nfs, err := os.OpenFile("./OctopusData.csv", os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Fatalf("can not create file, err is %+v", err)
	}

	w := csv.NewWriter(nfs)
	w.Comma = ','
	w.UseCRLF = true

	record := []string{Name, ID, OD.whichEye, OD.result, OS.whichEye, OS.result}
	if err := w.Write(record); err != nil {
		log.Fatalf("can not write, err is %+v", err)
	}
	// The csv writer is buffered: nothing reaches the file until Flush, and
	// Flush reports its failure only through Error.
	w.Flush()
	if err := w.Error(); err != nil {
		log.Fatalf("can not write, err is %+v", err)
	}
	if err := nfs.Close(); err != nil {
		log.Fatalf("can not close file, err is %+v", err)
	}
}
// startServer runs "java -jar tika-server-standard-2.3.0.jar" through `cmd /c`
// and waits for that process to finish, then prints whatever it wrote to
// standard output. If the command fails the program is terminated via
// log.Fatal. main runs this function in its own goroutine.
func startServer() {
	server := exec.Command("cmd", "/c", "java -jar tika-server-standard-2.3.0.jar")
	stdout, runErr := server.Output()
	if runErr != nil {
		log.Fatal(runErr)
	}
	fmt.Print(string(stdout))
}
// runClient sends the file at path to the Tika server listening on
// http://localhost:9998, runs the returned document through HtmlParser, and
// returns the first extracted line as the right-eye (OD, "右") result and the
// second as the left-eye (OS, "左") result.
//
// A file that cannot be opened ends the program via log.Fatal. If Tika fails to
// parse the file, or the parser yields fewer than two lines, the missing
// results are left empty instead of crashing the whole batch.
func runClient(path string) (OD Eye, OS Eye) {
	OD.whichEye, OS.whichEye = "右", "左"

	f, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	client := tika.NewClient(nil, "http://localhost:9998")
	body, err := client.Parse(context.Background(), f)
	if err != nil {
		// Nothing usable came back; report it and keep the results empty.
		fmt.Println(err)
		return OD, OS
	}

	// HtmlParser emits one '\n'-terminated line per <div>. Indexing the split
	// result blindly panicked whenever the document had fewer than two of them.
	lines := strings.Split(HtmlParser(body), "\n")
	if len(lines) > 0 {
		OD.result = lines[0]
	}
	if len(lines) > 1 {
		OS.result = lines[1]
	}
	return OD, OS
}
// HtmlParser extracts the report indices from an HTML document.
//
// For every <div> in the document it takes the combined text of the <p>
// elements inside it and emits one line made of the first reg0, reg1 and reg2
// matches joined by single spaces and terminated by '\n' (a pattern that does
// not match contributes an empty string). The accumulated lines are printed to
// standard output and returned. A document that cannot be parsed ends the
// program via log.Fatal.
func HtmlParser(html string) (result string) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatal(err)
	}

	var out strings.Builder
	doc.Find("div").Each(func(_ int, div *goquery.Selection) {
		text := div.Find("p").Text()
		fields := []string{
			reg0.FindString(text),
			reg1.FindString(text),
			reg2.FindString(text),
		}
		out.WriteString(strings.Join(fields, " "))
		out.WriteString("\n")
	})

	result = out.String()
	fmt.Printf("%s", result)
	return result
}
// main starts the Tika server in the background, reads the patient names and
// IDs (one per line, paired by line number), and appends one CSV row per name:
// the values extracted from the last file found in
// ./Exams/<Name><ID>/Octopus/, or an empty row when the name is blank, has no
// matching ID line, or has no report file.
func main() {
	go startServer()

	Names := ReadFile("./Names.txt")
	IDs := ReadFile("./IDs.txt")

	title := Eye{"眼别", "result"}
	noData := Eye{"", ""}
	SaveFile("Name", "ID", title, title)

	// NOTE(review): a fixed one-second pause is the only synchronisation with
	// the server goroutine — confirm it is long enough for the server to start.
	time.Sleep(time.Second)

	for i, Name := range Names {
		if Name == "" {
			SaveFile("", "", noData, noData)
			continue
		}
		// IDs.txt may be shorter than Names.txt (or unreadable); indexing it
		// blindly would panic, so emit an empty row and keep going.
		if i >= len(IDs) {
			fmt.Printf("no ID for %q (line %d)\n", Name, i+1)
			SaveFile("", "", noData, noData)
			continue
		}

		Dir := "./Exams/" + Name + IDs[i] + "/Octopus/"
		files := ScanFiles(Dir)
		if len(files) == 0 {
			SaveFile("", "", noData, noData)
			continue
		}

		// Only the last file in the listing is parsed.
		OD, OS := runClient(files[len(files)-1])
		SaveFile(Name, IDs[i], OD, OS)
	}
}
package main
import (
"bufio"
"io"
"io/ioutil"
"os/exec"
"context"
"fmt"
"encoding/csv"
"github.com/google/go-tika/tika"
"github.com/PuerkitoBio/goquery"
"strings"
"regexp"
"log"
"os"
"time"
)
// Patterns applied by HtmlParser to the text of each <div> in the parsed
// report. Each one matches from its label up to the end of that line ('.' does
// not match a newline).
// NOTE(review): GHT / VFI / MD30-2 / PSD30-2 are presumably the Humphrey
// summary indices — confirm.
var (
// reg0 matches the first occurrence of "GHT:" and the rest of its line.
reg0 = regexp.MustCompile(`GHT:.+`)
// reg1 matches the first occurrence of "VFI:" and the rest of its line.
reg1 = regexp.MustCompile(`VFI:.+`)
// reg2 matches the first occurrence of "MD30-2:" and the rest of its line.
reg2 = regexp.MustCompile(`MD30-2:.+`)
// reg3 matches the first occurrence of "PSD30-2:" and the rest of its line.
reg3 = regexp.MustCompile(`PSD30-2:.+`)
)
// Eye is the pair of CSV cells written for one eye.
type Eye struct {
	// whichEye is the label cell; this file fills it with "右", "左", a header
	// title, or the empty string.
	whichEye string
	// result is the text extracted from the report for that eye.
	result string
}
// ReadFile reads fileName line by line and returns every line with its
// surrounding whitespace trimmed.
//
// Blank lines are kept as empty strings so that line i of one file stays
// aligned with line i of another (main pairs Names.txt with IDs.txt by index).
// Whatever follows the last '\n' is appended as well, so a file that ends with
// a newline yields a trailing "" element.
//
// If the file cannot be opened the error is printed and nil is returned. If a
// read fails midway the error is printed and the lines collected so far are
// returned.
func ReadFile(fileName string) (res []string) {
	// Read-only access is all that is needed; opening with O_RDWR made the
	// call fail on write-protected files for no benefit.
	file, err := os.Open(fileName)
	if err != nil {
		fmt.Println("Open file error!", err)
		return nil
	}
	defer file.Close()

	// The size is purely informational, so a Stat failure must not abort the run.
	if stat, err := file.Stat(); err != nil {
		fmt.Println("Stat file error!", err)
	} else {
		fmt.Println("file size=", stat.Size())
	}

	buf := bufio.NewReader(file)
	for {
		line, err := buf.ReadString('\n')
		// ReadString returns the data read so far even on error, so the
		// final unterminated line is recorded before the error is examined.
		res = append(res, strings.TrimSpace(line))
		if err == io.EOF {
			fmt.Println("File read ok!")
			return res
		}
		if err != nil {
			fmt.Println("Read file error!", err)
			return res
		}
	}
}
// PathExists reports whether path can be stat'ed.
//
// It returns (true, nil) when os.Stat succeeds, (false, nil) when the error
// says the path does not exist, and (false, err) for any other Stat error.
func PathExists(path string) (bool, error) {
	_, statErr := os.Stat(path)
	switch {
	case statErr == nil:
		return true, nil
	case os.IsNotExist(statErr):
		return false, nil
	default:
		return false, statErr
	}
}
// ScanFiles returns the paths of the regular (non-directory) entries directly
// inside fileDir, in the filename order produced by ioutil.ReadDir.
//
// Each path is fileDir followed by the entry name; a path separator is inserted
// when fileDir does not already end with one. A missing directory yields nil
// silently; any other error while reading the directory is printed and also
// yields nil.
func ScanFiles(fileDir string) []string {
	entries, err := ioutil.ReadDir(fileDir)
	if err != nil {
		// A directory that simply is not there is an expected case for the
		// caller; everything else is worth reporting instead of being dropped.
		if !os.IsNotExist(err) {
			fmt.Printf("get dir error![%v]\n", err)
		}
		return nil
	}

	// Plain concatenation only works when the caller remembered the trailing
	// separator; add it when it is missing so the joined path is always valid.
	prefix := fileDir
	if n := len(prefix); n > 0 && !os.IsPathSeparator(prefix[n-1]) {
		prefix += string(os.PathSeparator)
	}

	var fileNameList []string
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		fileNameList = append(fileNameList, prefix+entry.Name())
	}
	return fileNameList
}
// SaveFile appends one CSV record to ./HumphreyData.csv, creating the file if
// it does not exist yet. The record is: Name, ID, then the label and result of
// OD followed by the label and result of OS. Records are comma separated and
// terminated with CRLF.
//
// Any failure to open, write, or close the file ends the program via
// log.Fatalf.
func SaveFile(Name string, ID string, OD Eye, OS Eye) {
	// O_APPEND sends every write to the end of the file, which replaces the
	// earlier manual Seek whose error was never checked.
	nfs, err := os.OpenFile("./HumphreyData.csv", os.O_WRONLY|os.O_APPEND|os.O_CREATE, 0666)
	if err != nil {
		log.Fatalf("can not create file, err is %+v", err)
	}

	w := csv.NewWriter(nfs)
	w.Comma = ','
	w.UseCRLF = true

	record := []string{Name, ID, OD.whichEye, OD.result, OS.whichEye, OS.result}
	if err := w.Write(record); err != nil {
		log.Fatalf("can not write, err is %+v", err)
	}
	// The csv writer is buffered: nothing reaches the file until Flush, and
	// Flush reports its failure only through Error.
	w.Flush()
	if err := w.Error(); err != nil {
		log.Fatalf("can not write, err is %+v", err)
	}
	if err := nfs.Close(); err != nil {
		log.Fatalf("can not close file, err is %+v", err)
	}
}
// startServer runs "java -jar tika-server-standard-2.3.0.jar" through `cmd /c`
// and waits for that process to finish, then prints whatever it wrote to
// standard output. If the command fails the program is terminated via
// log.Fatal. main runs this function in its own goroutine.
func startServer() {
	server := exec.Command("cmd", "/c", "java -jar tika-server-standard-2.3.0.jar")
	stdout, runErr := server.Output()
	if runErr != nil {
		log.Fatal(runErr)
	}
	fmt.Print(string(stdout))
}
// runClient sends the file at path to the Tika server listening on
// http://localhost:9998, runs the returned document through HtmlParser, and
// returns the second extracted line as the right-eye (OD, "右") result and the
// first as the left-eye (OS, "左") result.
//
// A file that cannot be opened ends the program via log.Fatal. If Tika fails to
// parse the file, or the parser yields fewer than two lines, the missing
// results are left empty instead of crashing the whole batch.
func runClient(path string) (OD Eye, OS Eye) {
	OD.whichEye, OS.whichEye = "右", "左"

	f, err := os.Open(path)
	if err != nil {
		log.Fatal(err)
	}
	defer f.Close()

	client := tika.NewClient(nil, "http://localhost:9998")
	body, err := client.Parse(context.Background(), f)
	if err != nil {
		// Nothing usable came back; report it and keep the results empty.
		fmt.Println(err)
		return OD, OS
	}

	// HtmlParser emits one '\n'-terminated line per <div>. Indexing the split
	// result blindly panicked whenever the document had fewer than two of them.
	lines := strings.Split(HtmlParser(body), "\n")
	if len(lines) > 0 {
		OS.result = lines[0]
	}
	if len(lines) > 1 {
		OD.result = lines[1]
	}
	return OD, OS
}
// HtmlParser extracts the report indices from an HTML document.
//
// For every <div> in the document it takes the combined text of the <p>
// elements inside it and emits one line made of the first reg0, reg1, reg2 and
// reg3 matches joined by single spaces and terminated by '\n' (a pattern that
// does not match contributes an empty string). The accumulated lines are
// printed to standard output and returned. A document that cannot be parsed
// ends the program via log.Fatal.
func HtmlParser(html string) (result string) {
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatal(err)
	}

	var out strings.Builder
	doc.Find("div").Each(func(_ int, div *goquery.Selection) {
		text := div.Find("p").Text()
		fields := []string{
			reg0.FindString(text),
			reg1.FindString(text),
			reg2.FindString(text),
			reg3.FindString(text),
		}
		out.WriteString(strings.Join(fields, " "))
		out.WriteString("\n")
	})

	result = out.String()
	fmt.Printf("%s", result)
	return result
}
// main starts the Tika server in the background, reads the patient names and
// IDs (one per line, paired by line number), and appends one CSV row per name:
// the values extracted from the last file found in
// ./Exams/<Name><ID>/Humphrey/, or an empty row when the name is blank, has no
// matching ID line, or has no report file.
func main() {
	go startServer()

	Names := ReadFile("./Names.txt")
	IDs := ReadFile("./IDs.txt")

	title := Eye{"眼别", "result"}
	noData := Eye{"", ""}
	SaveFile("Name", "ID", title, title)

	// NOTE(review): a fixed one-second pause is the only synchronisation with
	// the server goroutine — confirm it is long enough for the server to start.
	time.Sleep(time.Second)

	for i, Name := range Names {
		if Name == "" {
			SaveFile("", "", noData, noData)
			continue
		}
		// IDs.txt may be shorter than Names.txt (or unreadable); indexing it
		// blindly would panic, so emit an empty row and keep going.
		if i >= len(IDs) {
			fmt.Printf("no ID for %q (line %d)\n", Name, i+1)
			SaveFile("", "", noData, noData)
			continue
		}

		Dir := "./Exams/" + Name + IDs[i] + "/Humphrey/"
		files := ScanFiles(Dir)
		if len(files) == 0 {
			SaveFile("", "", noData, noData)
			continue
		}

		// Only the last file in the listing is parsed.
		OD, OS := runClient(files[len(files)-1])
		SaveFile(Name, IDs[i], OD, OS)
	}
}
用Markdown[5]记笔记、排版公众号以及写Blog真的是太方便了,文末推荐一波Markdown软件:
由java写成的PDF文件解析包:https://tika.apache.org/download.html ↩︎
Go语言用来与tika交互的程序包:https://github.com/google/go-tika ↩︎
参考CSDN配置Java环境 ↩︎
参考CSDN配置Go环境:https://golang.google.cn/doc/install?download=go1.12.6.windows-amd64.msi ↩︎
Markdown是一种轻量级的标记语言,非常易学,几分钟就能学会基本操作。 ↩︎
下载链接:https://joplinapp.org/ ↩︎
下载链接:https://neuxlab.cn/ ↩︎