响应式网站 做搜索推广缺点,剑三做月饼活动网站,专业建设特色,免费公司注册网站吗pdfplumber 的特点
1、它是一个纯 python 第三方库，适合 python 3.x 版本 2、它用来查看pdf各类信息，能有效提取文本、表格 3、它不支持修改或生成pdf，也不支持对pdf扫描件的处理
import glob
import pdfplumber
import re
from collection…pdfplumber 的特点
1、它是一个纯 python 第三方库，适合 python 3.x 版本 2、它用来查看pdf各类信息，能有效提取文本、表格 3、它不支持修改或生成pdf，也不支持对pdf扫描件的处理
import glob
import pdfplumber
import re
from collections import defaultdict
import json


class PDFProcessor:
    """Extract text lines and table rows from a PDF using pdfplumber.

    Every extracted text line or table row is stored in ``all_text`` under a
    running row number as a dict with the keys ``page``, ``allrow``, ``type``
    and ``inside``. ``save_all_text`` dumps those dicts as JSON lines.
    """

    # NOTE(review): the full-width punctuation inside these patterns was lost
    # in the extracted source and has been restored from context -- confirm
    # against the upstream script.
    # A text line ending with one of these is treated as complete, so the
    # following word starts a new line even if the line reached the margin.
    _LINE_END_RE = re.compile(
        r'(?:。|；|单位：元|单位：万元|币种：人民币|\d'
        r'|报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$'
    )
    # First line of a page that looks like a report title (page header).
    _HEADER_RE = re.compile(
        r'[^计](?:报告(?:全文)?(?:（修订版）|（修订稿）|（更正后）)?)$'
    )
    # Last line of a page that starts like a page counter (page footer).
    # Written as a raw string so that ``\\`` means a literal backslash; in a
    # plain string literal ``\\|\/`` collapses to ``\|\/`` and matches the
    # two-character text "|/" instead of "backslash or slash".
    _FOOTER_RE = re.compile(r'^(?:\d|\\|\/|第|共|页|-|_| ){1,}')

    def __init__(self, filepath):
        """Open ``filepath`` with pdfplumber and reset the row counters."""
        self.filepath = filepath
        # Open the document; mind where the file is stored.
        self.pdf = pdfplumber.open(filepath)
        self.all_text = defaultdict(dict)
        self.allrow = 0
        self.last_num = 0

    def close(self):
        """Close the PDF handle opened in ``__init__``."""
        self.pdf.close()

    def check_lines(self, page, top, buttom):
        """Join the words of ``page`` into newline-separated text lines.

        Args:
            page: object providing ``extract_words()``, ``height`` and
                ``width``; each word is a dict with ``text``, ``top`` and
                ``x1``.
            top: upper limit for a word's ``top`` value, or ``''`` for no
                upper limit.
            buttom: lower limit for a word's ``top`` value, or ``''`` (only
                together with ``top == ''``) to take the whole page.

        Returns:
            The selected words concatenated into one string. A word is
            appended directly when its ``top`` is within 2 units of the
            previous word's, or when the previous word ended beyond 85% of
            the page width, the word sits above the lower page margin and the
            text so far does not end like a finished line. Otherwise it is
            appended after a ``'\\n'``.
        """
        whole_page = top == '' and buttom == ''
        # The whole-page case tolerates wrapped lines a bit further down the
        # page (90% of the height) than the table-bounded cases (85%).
        height_ratio = 0.9 if whole_page else 0.85

        text = ''
        last_top = 0
        last_check = 0
        for word in page.extract_words():
            word_top = word['top']
            if whole_page:
                selected = True
            elif top == '':
                selected = word_top > buttom
            else:
                selected = buttom < word_top < top

            if selected:
                if abs(last_top - word_top) <= 2:
                    text += word['text']
                elif (last_check > 0
                      and page.height * height_ratio - word_top > 0
                      and not self._LINE_END_RE.search(text)):
                    text += word['text']
                else:
                    text += '\n' + word['text']

            # Tracked for every word, including the ones outside the region.
            last_top = word_top
            last_check = word['x1'] - page.width * 0.85
        return text

    def drop_empty_cols(self, data):
        """Return ``data`` without the columns whose cells are all ``''``."""
        columns = [list(col) for col in zip(*data)]
        kept = [col for col in columns if not all(cell == '' for cell in col)]
        return [list(row) for row in zip(*kept)]

    @staticmethod
    def keep_visible_lines(obj):
        """
        If the object is a ``rect`` type, keep it only if the lines are visible.

        A visible line is the one having ``non_stroking_color`` not null.
        A ``char`` object is kept only when both of its colors are set; every
        other object is kept.
        """
        if obj['object_type'] == 'rect':
            if obj['non_stroking_color'] is None:
                return False
            if obj['width'] < 1 and obj['height'] < 1:
                return False
        if obj['object_type'] == 'char':
            return (obj['stroking_color'] is not None
                    and obj['non_stroking_color'] is not None)
        return True

    def _add_row(self, page, row_type, content):
        """Store one output row and advance the running row number."""
        self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,
                                      'type': row_type, 'inside': content}
        self.allrow += 1

    def _add_text_rows(self, page, top, buttom):
        """Store every text line found between ``buttom`` and ``top``."""
        for line in self.check_lines(page, top, buttom).split('\n'):
            self._add_row(page, 'text', line)

    @staticmethod
    def _clean_table(table):
        """Merge continuation rows into their parent row and normalize cells.

        A row whose first cell is ``None`` is treated as the continuation of
        the closest preceding row whose first cell is set: its non-blank
        cells are appended to that row and the continuation row is dropped.
        Remaining cells have newlines removed and ``None`` replaced by ``''``;
        rows that end up completely empty are dropped.
        """
        anchor = None
        for index, row in enumerate(table):
            if not row:
                continue
            if row[0] is not None:
                anchor = index
                continue
            if anchor is None:
                # Continuation rows before any parent row have nothing to
                # merge into; a negative index would hit the last row.
                continue
            for col, cell in enumerate(row):
                if cell is not None and cell not in ['', ' ']:
                    if table[anchor][col] is None:
                        table[anchor][col] = cell
                    else:
                        table[anchor][col] += cell
                    row[col] = None

        cleaned = []
        for row in table:
            if not row or row[0] is None:
                continue
            cells = ['' if cell is None else cell.replace('\n', '') for cell in row]
            if any(cells):
                cleaned.append(cells)
        return cleaned

    def _mark_header_footer(self, page):
        """Retag the first/last row of the current page as header/footer."""
        # Row ``last_num + 1`` is normally the empty string produced by the
        # leading newline of check_lines, hence the offset of 2 (and the
        # index 1 on the first page).
        first_key = 1 if self.last_num == 0 else self.last_num + 2
        last_key = len(self.all_text) - 1
        # .get() avoids the defaultdict inserting an empty row on a miss.
        first_row = self.all_text.get(first_key)
        last_row = self.all_text.get(last_key)
        if first_row is None or last_row is None:
            print(page.page_number)
            return

        first_text = str(first_row['inside'])
        end_text = str(last_row['inside'])
        # A '[' in the last row means it is a stringified table row.
        if '[' in end_text:
            return
        if self._HEADER_RE.search(first_text):
            first_row['type'] = '页眉'
        if self._FOOTER_RE.search(end_text):
            last_row['type'] = '页脚'

    def extract_text_and_tables(self, page):
        """Append the text lines and table rows of ``page`` to ``all_text``.

        Text above each table is stored first, then the table rows (type
        ``excel``, stored as ``str(row)``), then the text below the last
        table. Pages without tables are stored as text only. Finally the
        first and last row of the page may be retagged as header/footer.
        """
        buttom = 0
        page = page.filter(self.keep_visible_lines)
        tables = page.find_tables()
        if len(tables) >= 1:
            # Table data.
            count = len(tables)
            for table in tables:
                if table.bbox[3] < buttom:
                    # Table ends above the previous table's bottom: skipped.
                    continue
                count -= 1
                self._add_text_rows(page, table.bbox[1], buttom)
                buttom = table.bbox[3]
                rows = self.drop_empty_cols(self._clean_table(table.extract()))
                for row in rows:
                    self._add_row(page, 'excel', str(row))
            # NOTE(review): the trailing text is only stored when no table was
            # skipped above -- confirm this is intended.
            if count == 0:
                self._add_text_rows(page, '', buttom)
        else:
            # Text data.
            self._add_text_rows(page, '', '')

        self._mark_header_footer(page)
        self.last_num = len(self.all_text) - 1

    def process_pdf(self):
        """Run ``extract_text_and_tables`` on every page of the PDF."""
        for page in self.pdf.pages:
            self.extract_text_and_tables(page)

    def save_all_text(self, path):
        """Write every stored row to ``path`` as one JSON object per line."""
        with open(path, 'w', encoding='utf-8') as file:
            for row in self.all_text.values():
                file.write(json.dumps(row, ensure_ascii=False) + '\n')


def process_all_pdfs_in_folder(folder_path):
    """Convert every file in ``folder_path`` and save the result as .txt.

    Files are processed in reverse-sorted order. A failure on one file is
    reported and does not stop the remaining files.
    """
    import os

    for file_path in sorted(glob.glob(f'{folder_path}/*'), reverse=True):
        print(file_path)
        try:
            processor = PDFProcessor(file_path)
            try:
                processor.process_pdf()
                # basename also handles the backslashes glob returns on Windows.
                save_path = 'RAG_ASMPLE_DATAS_TXTS/' + os.path.basename(file_path).replace('.pdf', '.txt')
                processor.save_all_text(save_path)
            finally:
                processor.close()
        except Exception as exc:
            print('check', exc)


if __name__ == '__main__':
    # Path of the PDF file to parse.
    pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'
    # Text file that receives the parsed PDF content.
    out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'
    processor = PDFProcessor(pdf_path)
    try:
        processor.process_pdf()
        processor.save_all_text(out_path)
    finally:
        processor.close()

# References
版面分析–PDF解析神器pdfplumber 版面分析–富文本txt读取
补充 提取PDF中的图片并保存到本地
import pdfplumber
import os

# Path of the PDF file to parse (placeholder).
file_name = '**.pdf'
# Base name of the files that receive the extracted images (placeholder).
# NOTE(review): the extension is kept from the original snippet; the raw
# stream is image data, so pick an extension matching the PDF's images.
output_file = '**.xlsx'

with pdfplumber.open(file_name) as pdf:
    # Get the first page; pages are zero-indexed.
    first_page = pdf.pages[0]
    print('页码', first_page.page_number)
    print('page width:', first_page.width)
    print('page height:', first_page.height)

    # Get the first page text.
    text = first_page.extract_text()
    print(text)

    # Images of the first page: a list of dicts.
    imgs = first_page.images
    base, ext = os.path.splitext(output_file)
    for index, img in enumerate(imgs):
        # Binary stream of the image.
        data = img['stream'].get_data()
        print(data)
        # One file per image, so later images do not overwrite earlier ones.
        with open(f'{base}_{index}{ext}', mode='wb') as f2:
            f2.write(data)

# Next section: extract table text from a PDF and save it as an Excel file
import pdfplumber
from openpyxl import Workbook# 保存表格需要安装openpyxl
# Path of the PDF file to parse (placeholder).
file_name = '**.pdf'
# Excel workbook that receives the table (placeholder).
output_file = '**.xlsx'

with pdfplumber.open(file_name) as pdf:
    page01 = pdf.pages[0]
    # extract_table() yields nothing when the page has no table; fall back
    # to an empty list so an empty workbook is written instead of crashing.
    table = page01.extract_table() or []
    workbook = Workbook()
    sheet = workbook.active
    for row in table:
        sheet.append(row)
    workbook.save(filename=output_file)

# Next section: extract table text from a PDF
import pdfplumber
# Path of the PDF file to parse (placeholder).
file_name = '**.pdf'
# Text file that receives the table content (placeholder).
output_file = '**.txt'

# Open the output once and let the context manager close it, instead of
# opening a new handle for every page.
with pdfplumber.open(file_name) as p, open(output_file, 'a', encoding='utf-8') as data:
    # Walk through every page of the document.
    for page in p.pages:
        # Table of the current page as a list of rows; use
        # page.extract_tables() to get several tables per page.
        textdata = page.extract_table()
        if not textdata:
            continue
        # write() needs a string: one line per row, cells separated by tabs.
        for row in textdata:
            data.write('\t'.join('' if cell is None else str(cell) for cell in row) + '\n')

# Next section: extract plain text from a PDF
import pdfplumber
# Path of the PDF file to parse (placeholder).
file_name = '**.pdf'
# Text file that receives the page text (placeholder).
output_file = '**.txt'

# Open the output once and let the context manager close it, instead of
# opening a new handle for every page.
with pdfplumber.open(file_name) as p, open(output_file, 'a', encoding='utf-8') as data:
    # Walk through every page of the document.
    for page in p.pages:
        # Text of the current page; guard against a page without text.
        textdata = page.extract_text() or ''
        # The newline keeps the last line of a page apart from the next page.
        data.write(textdata + '\n')
读取富文本txt：python 读取文件的函数有三种：read()、readline()、readlines()
read()：一次性读取所有文本；readline()：每次调用读取一行内容（首次调用返回第一行）；readlines()：读取全部内容，以列表的格式返回
# read(): load the whole file as one string.
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    data = f.read()
    print(data)

# readline(): read a single line (here the first one).
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    data = f.readline()
    print(data)

# readlines(): read all lines into a list, then print them without the
# trailing newline.
with open('rag_datas/story.txt', 'r', encoding='utf-8') as f:
    for line in f.readlines():
        line = line.strip('\n')
        print(line)