1. 下面是我最近编写的代码,供大家参考,各项功能都写了注释。
这里我主要使用 Aspose.Pdf 将 PDF 电子发票上的二维码图片截取下来,再用 ZXing 解码获取其中的内容(各字段的含义请参见"增值税发票解码字段说明");然后使用 Aspose.Pdf 获取电子发票 PDF 文件的文件信息(这里用于获取供应商信息,并判断发票是否被篡改),以及 PDF 文件上的所有文本内容。提取出来的文本是带格式的,不过目前我还没有弄清楚这些文本内容的排序规则。
关于获取 PDF 文件文本的方式,我写了多种,大家可以都试试看。
ZXing dll 下载链接:ZXing.dll(当然,百度上也能找到很多)。
Aspose dll 下载链接:Aspose dll(百度上也能找到很多)。我这边使用的版本是 10.1.0.0。
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using Aspose.Pdf;
using Aspose.Pdf.Text;
using Aspose.Pdf.Facades;
using System.IO;
using System.Drawing.Imaging;
using System.Drawing;
using ZXing;
using System.Security.Cryptography.X509Certificates;
namespace FlexSystem.V11.SHUtils
{
public class AspPdfHelper
{
// Shared ZXing reader; DecodePdfPic uses it to decode the images extracted from PDF pages.
static BarcodeReader barcodeReader = new BarcodeReader();
/// <summary>
/// Accepted invoice type codes:
/// 10 - VAT electronic ordinary invoice, 04 - VAT ordinary invoice, 01 - VAT special invoice.
/// </summary>
public static string[] CodeArry = new string[] { "10", "04", "01" };
/// <summary>
/// Extracts the images embedded in each page of a PDF invoice, decodes the invoice QR code found
/// among them, reads the document's digital-signature and certificate details, and merges both
/// sets of fields.
/// </summary>
/// <param name="filePath">Absolute path of the PDF file. The ".pdf" extension is matched case-insensitively.</param>
/// <param name="fileSavePath">Directory into which every extracted image is written as a .jpg file.</param>
/// <remarks>
/// The outer key of the result is a zero-based index; the inner dictionary holds the decoded
/// QR-code fields merged with the signature fields that share the same "Sequence" value.
/// Entries are produced only for indices that have a digital signature, so a PDF without any
/// signature yields an empty result. An index whose page has no decodable invoice QR code maps
/// to an empty dictionary.
/// Early exits: when the file is missing or is not a PDF, when the invoice type code is not
/// listed in <see cref="CodeArry"/>, or when a signature fails verification, the method returns
/// immediately with the dictionary built so far, whose last entry carries an "Error" key.
/// </remarks>
/// <returns>Dictionary of merged QR-code and signature fields, keyed by zero-based index.</returns>
public static Dictionary<int, Dictionary<string, string>> DecodePdfPic(string filePath, string fileSavePath)
{
    var qrFieldsByPage = new Dictionary<int, Dictionary<string, string>>();
    var signatureInfoByIndex = new Dictionary<int, Dictionary<string, string>>();
    var mergedByIndex = new Dictionary<int, Dictionary<string, string>>();

    // Guard clause. The extension comparison ignores case so that "INVOICE.PDF" is accepted too.
    if (!File.Exists(filePath)
        || !string.Equals(Path.GetExtension(filePath), ".pdf", StringComparison.OrdinalIgnoreCase))
    {
        qrFieldsByPage[0] = new Dictionary<string, string>()
        {
            ["Error"] = "this filePath is not exists or file's extension is not pdf"
        };
        return qrFieldsByPage;
    }

    using (Document pdfDocument = new Document(filePath))
    {
        // Running counter that keeps the saved image file names unique within this call.
        int imageCounter = 1;

        // Aspose page and image collections are 1-based.
        for (int pageNumber = 1; pageNumber <= pdfDocument.Pages.Count; pageNumber++)
        {
            var pageFields = new Dictionary<string, string>();
            var pageImages = pdfDocument.Pages[pageNumber].Resources.Images;
            if (pageImages != null)
            {
                for (int imageIndex = 1; imageIndex <= pageImages.Count; imageIndex++)
                {
                    string barcodeText = DecodeBarcodeText(pageImages[imageIndex], fileSavePath, imageCounter);
                    imageCounter++;

                    if (barcodeText != null && !AddInvoiceQrFields(barcodeText, pageNumber, pageFields))
                    {
                        // Unknown invoice type code: report the error immediately.
                        qrFieldsByPage[pageNumber - 1] = pageFields;
                        return qrFieldsByPage;
                    }
                }
            }
            qrFieldsByPage[pageNumber - 1] = pageFields;
        }

        using (PdfFileSignature pdfSign = new PdfFileSignature(pdfDocument))
        {
            IList<string> names = (IList<string>)pdfSign.GetSignNames();
            for (int i = 0; i < names.Count; i++)
            {
                var signatureFields = new Dictionary<string, string>();
                bool verified = AddSignatureFields(pdfSign, names[i], i + 1, signatureFields);
                signatureInfoByIndex[i] = signatureFields;
                if (!verified)
                {
                    // A signature that fails verification aborts processing with an "Error" entry.
                    return signatureInfoByIndex;
                }
            }
        }
    }

    // Merge the QR-code fields into the signature fields that carry the same "Sequence" value.
    foreach (KeyValuePair<int, Dictionary<string, string>> signatureEntry in signatureInfoByIndex)
    {
        var merged = new Dictionary<string, string>();
        Dictionary<string, string> pageFields;
        string pageSequence;

        // TryGetValue guards against pages that had no decodable QR code (no "Sequence" key).
        if (qrFieldsByPage.TryGetValue(signatureEntry.Key, out pageFields)
            && pageFields.TryGetValue("Sequence", out pageSequence)
            && pageSequence == signatureEntry.Value["Sequence"])
        {
            merged = MergeDictionary(pageFields, signatureEntry.Value);
        }
        mergedByIndex[signatureEntry.Key] = merged;
    }
    return mergedByIndex;
}

/// <summary>
/// Saves one PDF image to <paramref name="fileSavePath"/> as a JPEG and tries to decode it as a barcode.
/// </summary>
/// <returns>The decoded barcode text, or null when the image contains no readable barcode.</returns>
private static string DecodeBarcodeText(XImage image, string fileSavePath, int imageCounter)
{
    FileHelper.CheckDirectory(fileSavePath, true);
    string imageName = DateTime.Now.ToString("yyyyMMddHHmmss") + imageCounter.ToString() + ".jpg";
    using (FileStream outputImage = new FileStream(Path.Combine(fileSavePath, imageName), FileMode.Create))
    {
        image.Save(outputImage, ImageFormat.Jpeg);
        // Rewind so the bitmap is read from the start of the data that was just written.
        outputImage.Seek(0, SeekOrigin.Begin);
        // The bitmap must be disposed before its backing stream is closed.
        using (Bitmap bitmap = new Bitmap(outputImage))
        {
            var barcodeResult = barcodeReader.Decode(bitmap);
            return barcodeResult?.Text;
        }
    }
}

/// <summary>
/// Parses the comma-separated invoice QR payload and stores its fields in <paramref name="pageFields"/>.
/// Payloads with fewer than seven fields are not invoice QR codes and are ignored.
/// </summary>
/// <returns>
/// False when the invoice type code is not listed in <see cref="CodeArry"/> (an "Error" entry is
/// added to <paramref name="pageFields"/>); true otherwise.
/// </returns>
private static bool AddInvoiceQrFields(string barcodeText, int pageNumber, Dictionary<string, string> pageFields)
{
    const int requiredFieldCount = 7;

    // Trim first, then drop a single trailing comma so it does not produce an empty last field.
    string payload = barcodeText.Trim();
    if (payload.EndsWith(",", StringComparison.Ordinal))
    {
        payload = payload.Substring(0, payload.Length - 1);
    }

    string[] parts = payload.Split(',');
    if (parts.Length < requiredFieldCount)
    {
        // Some other barcode (not an invoice QR code): skip it instead of indexing out of range.
        return true;
    }

    pageFields["Sequence"] = pageNumber.ToString();
    pageFields["Code1"] = parts[0];

    // 10 - VAT electronic ordinary invoice, 04 - VAT ordinary invoice, 01 - VAT special invoice
    if (Array.IndexOf(CodeArry, parts[1]) < 0)
    {
        pageFields["发票种类代码"] = "发票种类代码错误,请查验";
        pageFields["Error"] = "发票种类代码错误,请查验";
        return false;
    }

    pageFields["发票种类代码"] = parts[1];   // invoice type code
    pageFields["发票代码"] = parts[2];       // invoice code
    pageFields["发票号码"] = parts[3];       // invoice number
    pageFields["开票金额"] = parts[4];       // invoiced amount
    pageFields["开票日期"] = FormatInvoiceDate(parts[5]); // issue date
    pageFields["发票校验码"] = parts[6];     // invoice check code

    // Extra trailing field present on some invoice kinds.
    if (parts.Length == 8)
    {
        pageFields["Code7"] = parts[7];
    }
    return true;
}

/// <summary>
/// Turns a "yyyyMMdd" value into "yyyy-MM-dd"; values shorter than eight characters are returned unchanged.
/// </summary>
private static string FormatInvoiceDate(string rawDate)
{
    if (rawDate.Length < 8)
    {
        return rawDate;
    }
    return rawDate.Substring(0, 4) + "-" + rawDate.Substring(4, 2) + "-" + rawDate.Substring(6, 2);
}

/// <summary>
/// Collects revision, verification, location, signing time and certificate details of one signature
/// into <paramref name="info"/>.
/// </summary>
/// <returns>
/// False when the signature fails verification (an "Error" entry is added and the remaining
/// details are not read); true otherwise.
/// </returns>
private static bool AddSignatureFields(PdfFileSignature pdfSign, string signName, int sequence, Dictionary<string, string> info)
{
    info["Sequence"] = sequence.ToString();
    info["签名"] = signName;
    info["revision"] = Convert.ToString(pdfSign.GetRevision(signName));
    info["verifysigned"] = Convert.ToString(pdfSign.VerifySigned(signName));
    if (string.Equals(info["verifysigned"], "FALSE", StringComparison.OrdinalIgnoreCase))
    {
        info["Error"] = "发票检查有误,请查验";
        return false;
    }

    info["location"] = pdfSign.GetLocation(signName);
    info["datatime"] = pdfSign.GetDateTime(signName).ToString("yyyy-MM-dd HH:mm:ss");

    using (Stream cerStream = pdfSign.ExtractCertificate(signName))
    {
        // Guard against a signature for which no certificate stream is returned.
        if (cerStream != null)
        {
            X509Certificate2 certificate = new X509Certificate2(ReadAllBytes(cerStream));

            // SplitStr returns null for an empty subject.
            Dictionary<string, string> subjectFields = SplitStr(certificate.Subject);
            if (subjectFields != null)
            {
                foreach (KeyValuePair<string, string> subjectField in subjectFields)
                {
                    info[subjectField.Key] = subjectField.Value;
                }
            }
            info["证书生效的本地日期"] = certificate.NotBefore.ToString("yyyy-MM-dd HH:mm:ss"); // valid from (local time)
            info["证书失效的本地日期"] = certificate.NotAfter.ToString("yyyy-MM-dd HH:mm:ss");  // valid until (local time)
            info["证书序列号"] = certificate.SerialNumber;                                       // serial number
        }
    }

    info["totalvision"] = Convert.ToString(pdfSign.GetTotalRevision());
    return true;
}

/// <summary>
/// Reads a stream to its end; a single Stream.Read call is not guaranteed to fill the buffer.
/// </summary>
private static byte[] ReadAllBytes(Stream source)
{
    using (MemoryStream buffer = new MemoryStream())
    {
        source.CopyTo(buffer);
        return buffer.ToArray();
    }
}
/// <summary>
/// Copies into <paramref name="first"/> every entry of <paramref name="second"/> whose key is not
/// already present; existing values in <paramref name="first"/> are never overwritten.
/// </summary>
/// <param name="first">Dictionary that receives the entries; a new one is created when null.</param>
/// <param name="second">Dictionary whose entries are copied; may be null.</param>
/// <returns>The <paramref name="first"/> instance (or the newly created dictionary) after merging.</returns>
public static Dictionary<string, string> MergeDictionary(Dictionary<string, string> first, Dictionary<string, string> second)
{
    Dictionary<string, string> target = first ?? new Dictionary<string, string>();
    if (second != null)
    {
        foreach (KeyValuePair<string, string> entry in second)
        {
            if (target.ContainsKey(entry.Key))
            {
                // Keys already present keep the value from the first dictionary.
                continue;
            }
            target.Add(entry.Key, entry.Value);
        }
    }
    return target;
}
/// <summary>
/// Splits a comma-separated list of "key=value" pairs into a dictionary.
/// </summary>
/// <param name="Str">Text to split.</param>
/// <remarks>
/// Example input: CN=滴滴出行科技有限公司, OU=电子发票, O=911201163409833307, C=CN.
/// Each pair is split at its first '=' only, so a value may itself contain '='.
/// Segments without any '=' are skipped. Keys and values are trimmed; a repeated key keeps the
/// last value.
/// </remarks>
/// <returns>The parsed pairs, or null when <paramref name="Str"/> is null or empty.</returns>
public static Dictionary<string, string> SplitStr(string Str)
{
    if (string.IsNullOrEmpty(Str))
    {
        return null;
    }

    var pairs = new Dictionary<string, string>();
    foreach (string segment in Str.Split(','))
    {
        int separatorIndex = segment.IndexOf('=');
        if (separatorIndex < 0)
        {
            // Not a key=value pair; skipping avoids an IndexOutOfRangeException.
            continue;
        }

        string key = segment.Substring(0, separatorIndex).Trim();
        string value = segment.Substring(separatorIndex + 1).Trim();
        pairs[key] = value;
    }
    return pairs;
}
/// <summary>
/// Reads the text content of a PDF file.
/// </summary>
/// <param name="filePath">Absolute path of the PDF file.</param>
/// <returns>The text extracted from the PDF file.</returns>
public static string ReadPdfContent(string filePath)
{
    // Approach 1 (active): PdfExtractor. Per the original author's note it keeps the text layout of the PDF.
    PdfExtractor pdfExtractor = new PdfExtractor();
    try
    {
        pdfExtractor.BindPdf(filePath);
        pdfExtractor.ExtractText();
        using (MemoryStream textStream = new MemoryStream())
        {
            pdfExtractor.GetText(textStream);
            // Rewind before reading back what the extractor wrote.
            textStream.Seek(0, SeekOrigin.Begin);
            // The stream is decoded as Unicode (UTF-16).
            using (StreamReader streamReader = new StreamReader(textStream, Encoding.Unicode))
            {
                return streamReader.ReadToEnd();
            }
        }
    }
    finally
    {
        // Release the extractor even when binding or extraction throws.
        pdfExtractor.Dispose();
    }

    // Approach 2 (alternative, per the original author's note it also keeps the text layout):
    //Document pdfDocument = new Document(filePath);
    //TextAbsorber text = new TextAbsorber();
    //text.Visit(pdfDocument);
    //content = text.Text;
    //pdfDocument.Dispose();
    //text.Dispose();
    // Approach 3 (alternative, per the original author's note the text is concatenated without layout):
    //TextFragmentAbsorber tf = new TextFragmentAbsorber();
    //tf.Visit(pdfDocument);
    //for (int i = 1; i <= tf.TextFragments.Count; i++)
    //{
    // content += tf.TextFragments[i].Text;
    //}
}
}
}