当前位置：首页 > news > 正文

上海建筑安全协会网站网络管理系统的主要功能

news 2025/12/17 9:39:57

# -*- coding: utf-8 -*-
"""Drag-and-drop PDF OCR: extract the images of a PDF and save their text.

This is a follow-up to an earlier post by the same author about recognising
a drag-and-dropped PDF with Python and converting it to a text file (CSDN
blog). Two things were updated:

1. Some PDF files produce images that lie sideways (landscape). The text
   can still be recognised, but the result may suffer, so such images are
   now rotated.
2. Some text was not recognised at all. After reading a CSDN article on
   optimisations that improve PaddleOCR accuracy, the cause turned out to be
   that the image files were too large, so they are now shrunk to half size
   before recognition. This did improve the recognition rate.

Created on Sun Aug 25 10:42:39 2024

@author: YBK
"""
import functools
import os
import re
import subprocess
import time

# NOTE: the GUI and third-party dependencies (tkinter, windnd, PyMuPDF,
# Pillow, PaddleOCR) are imported inside the functions that use them. The
# pure helpers therefore stay importable without a display or the OCR stack.

# Digits used for base-36 encoding: 0-9 followed by A-Z.
BASE36_DIGITS = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"

# Folder that receives the per-PDF image folder and the resulting text file.
OUTPUT_ROOT = "e:\\临时文件夹\\"

# Images with a side longer than this many pixels are halved before OCR.
MAX_IMAGE_SIDE = 2000

# Recognised text is kept only when its score is above this value.
MIN_SCORE = 0.7

# An xref object is treated as an image when its definition matches both.
_XOBJECT_RE = re.compile(r"/Type(?= */XObject)")
_IMAGE_RE = re.compile(r"/Subtype(?= */Image)")


def dec_to_36(num):
    """Return the integer *num* written in base 36 (digits 0-9, then A-Z).

    Negative numbers are rendered as "-" followed by the encoding of their
    absolute value; zero is rendered as "0".
    """
    if num < 0:
        return "-" + dec_to_36(-num)
    digits = []
    while True:
        num, rem = divmod(num, 36)  # quotient carries on, remainder is a digit
        digits.append(BASE36_DIGITS[rem])
        if num == 0:
            # Digits were produced least-significant first.
            return "".join(reversed(digits))


def nowtime_to_str():
    """Return the current Unix timestamp (whole seconds) in base 36.

    The result is about six characters long, which keeps the generated file
    names short.
    """
    return dec_to_36(int(time.time()))


def _normalize_image(image_path):
    """Shrink and/or rotate the image at *image_path* in place for OCR.

    Images with a side longer than ``MAX_IMAGE_SIDE`` are scaled to half
    size, which improved recognition accuracy for the author. Images that
    are then wider than tall are rotated 90 degrees counter-clockwise. The
    file is rewritten only when one of the two steps applied.
    """
    from PIL import Image

    with Image.open(image_path) as image:
        processed = image
        if processed.width > MAX_IMAGE_SIDE or processed.height > MAX_IMAGE_SIDE:
            processed = processed.resize(
                (int(processed.width * 0.5), int(processed.height * 0.5))
            )
            print("缩小图片尺寸")
        if processed.width > processed.height:
            processed = processed.transpose(Image.ROTATE_90)
            print("左旋转")
        changed = processed is not image
    # resize()/transpose() return independent in-memory images, so the
    # source file can be closed before it is overwritten.
    if changed:
        processed.save(image_path)
        processed.close()


def pdf2pic(path, pic_path):
    """Extract every embedded image of a PDF into *pic_path* as PNG files.

    Each image is written as ``<pdf name>_imgNNN.png`` (NNN is a zero-padded
    running number) and then normalised for OCR, see ``_normalize_image``.

    :param path: path of the PDF file
    :param pic_path: existing folder that receives the images
    :return: number of images extracted
    """
    import fitz  # PyMuPDF

    t0 = time.perf_counter()
    stem = os.path.splitext(os.path.basename(path))[0]
    imgcount = 0
    doc = fitz.Document(path)
    try:
        xref_count = doc.xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(path, len(doc), xref_count - 1))
        # xref 0 is reserved, so real objects start at 1.
        for xref in range(1, xref_count):
            definition = doc.xref_object(xref)
            if not (_XOBJECT_RE.search(definition) and _IMAGE_RE.search(definition)):
                continue
            imgcount += 1
            new_name = "{}_img{}.png".format(stem, str(imgcount).zfill(3))
            target = os.path.join(pic_path, new_name)
            pix = fitz.Pixmap(doc, xref)
            # PNG cannot store CMYK; four or more colour components
            # (alpha excluded) means the pixmap must be converted to RGB.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)
            pix.save(target)
            pix = None  # release the pixel buffer before the next image
            _normalize_image(target)
    finally:
        doc.close()
    t1 = time.perf_counter()
    print("运行时间:{}s".format(t1 - t0))
    print("提取了{}张图片".format(imgcount))
    return imgcount


def get_file_size(file_path):
    """Return the size of the file at *file_path* in bytes."""
    return os.path.getsize(file_path)


@functools.lru_cache(maxsize=1)
def _get_ocr():
    """Return the shared PaddleOCR engine, creating it on first use.

    Building the engine downloads and loads the model, so it is done only
    once per process rather than once per dropped file.
    """
    from paddleocr import PaddleOCR

    return PaddleOCR(use_angle_cls=True, lang="ch")


def _confident_lines(result, min_score=MIN_SCORE):
    """Return the recognised texts scoring above *min_score*, newline-ended.

    *result* is what ``ocr.ocr()`` returned for one image. Each detection is
    read as ``[box, (text, score)]``; entries that are ``None`` (nothing was
    recognised) are skipped.
    """
    lines = []
    for page in result or []:
        if not page:
            continue
        for detection in page:
            text, score = detection[1][0], detection[1][1]
            if score > min_score:
                lines.append(text + "\n")
    return lines


def dragged_files(files):
    """Handle a drop: OCR the dropped PDF and open the text in Notepad.

    Exactly one file must be dropped; anything other than a ``.pdf`` file
    (the extension is compared case-insensitively) is ignored. The images go
    to a folder named after the PDF plus a base-36 timestamp under
    ``OUTPUT_ROOT``, and the recognised text goes to a ``.txt`` file of the
    same name next to that folder.

    :param files: dropped paths as byte strings, decoded here as GBK
        (assumes that is the encoding windnd delivers; confirm on systems
        that use another code page)
    """
    if len(files) > 1:
        from tkinter.messagebox import showinfo

        showinfo("提示", "请拖放一个文件")
        return
    if not files:
        return
    fileurl = files[0].decode("gbk")
    stem, ext = os.path.splitext(os.path.basename(fileurl))
    if ext.lower() != ".pdf":
        return

    # The timestamp suffix keeps repeated runs on the same PDF from colliding.
    filename0 = stem + nowtime_to_str()
    pic_path = f"{OUTPUT_ROOT}{filename0}\\"
    outtxtpath = OUTPUT_ROOT + filename0 + ".txt"
    os.makedirs(pic_path, exist_ok=True)
    pdf2pic(fileurl, pic_path)

    ocr = _get_ocr()
    lines = []
    # Image names carry a zero-padded counter, so sorting restores the
    # extraction order that os.listdir() does not guarantee.
    for filename in sorted(os.listdir(pic_path)):
        img_path = pic_path + filename
        result = ocr.ocr(img_path, cls=True)
        print(img_path)
        lines.extend(_confident_lines(result))

    with open(outtxtpath, "w", encoding="utf-8") as f:
        f.writelines(lines)
    subprocess.run(["notepad.exe", outtxtpath], check=True)


def main():
    """Open the drop-target window and run the Tk event loop."""
    import tkinter as tk

    import windnd

    root_window = tk.Tk()
    root_window.title("拖放PDF文件识别文字")
    root_window.geometry("300x120")
    windnd.hook_dropfiles(root_window, func=dragged_files)
    root_window.mainloop()


if __name__ == "__main__":
    main()

查看全文

http://www.w-s-a.com/news/384890/