当前位置: 首页 > news >正文

网站建设租用服务器网店如何做推广

网站建设租用服务器,网店如何做推广,制作公司网站要多少费用呢,句容网站建设pdfplumber 的特点 1、它是一个纯 python 第三方库,适合 python 3.x 版本 2、它用来查看pdf各类信息,能有效提取文本、表格 3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理 import glob import pdfplumber import re from collection…

pdfplumber 的特点

1、它是一个纯 python 第三方库,适合 python 3.x 版本
2、它用来查看pdf各类信息,能有效提取文本、表格
3、它不支持修改或生成pdf,也不支持对pdf扫描件的处理

import glob
import pdfplumber
import re
from collections import defaultdict
import jsonclass PDFProcessor:def __init__(self, filepath):self.filepath = filepath#打开文档,注意存放的位置self.pdf = pdfplumber.open(filepath)self.all_text = defaultdict(dict)self.allrow = 0self.last_num = 0def check_lines(self, page, top, buttom):# 文本数据lines = page.extract_words()[::]text = ''last_top = 0last_check = 0for l in range(len(lines)):each_line = lines[l]check_re = '(?:。|;|单位:元|单位:万元|币种:人民币|\d|报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'if top == '' and buttom == '':if abs(last_top - each_line['top']) <= 2:text = text + each_line['text']#elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re, text):elif last_check > 0 and (page.height * 0.9 - each_line['top']) > 0 and not re.search(check_re, text):text = text + each_line['text']else:text = text + '\n' + each_line['text']elif top == '':if each_line['top'] > buttom:if abs(last_top - each_line['top']) <= 2:text = text + each_line['text']elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re,text):text = text + each_line['text']else:text = text + '\n' + each_line['text']else:if each_line['top'] < top and each_line['top'] > buttom:if abs(last_top - each_line['top']) <= 2:text = text + each_line['text']elif last_check > 0 and (page.height * 0.85 - each_line['top']) > 0 and not re.search(check_re,text):text = text + each_line['text']else:text = text + '\n' + each_line['text']last_top = each_line['top']last_check = each_line['x1'] - page.width * 0.85return textdef drop_empty_cols(self, data):# 删除所有列为空数据的列transposed_data = list(map(list, zip(*data)))filtered_data = [col for col in transposed_data if not all(cell is '' for cell in col)]result = list(map(list, zip(*filtered_data)))return result@staticmethoddef keep_visible_lines(obj):"""If the object is a ``rect`` type, keep it only if the lines are visible.A visible line is the one having ``non_stroking_color`` not null."""if obj['object_type'] == 'rect':if obj['non_stroking_color'] is None:return Falseif obj['width'] < 1 and obj['height'] < 1:return False# return obj['width'] >= 1 and obj['height'] >= 1 and obj['non_stroking_color'] is not Noneif obj['object_type'] == 'char':return obj['stroking_color'] is not None and obj['non_stroking_color'] is not Nonereturn Truedef extract_text_and_tables(self, page):buttom = 0page = page.filter(self.keep_visible_lines)tables = page.find_tables()if len(tables) >= 1:# 表格数据count = len(tables)for table in tables:if table.bbox[3] < buttom:passelse:count -= 1top = table.bbox[1]text = self.check_lines(page, top, buttom)text_list = text.split('\n')for _t in range(len(text_list)):self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,'type': 'text', 'inside': text_list[_t]}self.allrow += 1buttom = table.bbox[3]new_table = table.extract()r_count = 0for r in range(len(new_table)):row = new_table[r]if row[0] is None:r_count += 1for c in range(len(row)):if row[c] is not None and row[c] not in ['', ' ']:if new_table[r - r_count][c] is None:new_table[r - r_count][c] = row[c]else:new_table[r - r_count][c] += row[c]new_table[r][c] = Noneelse:r_count = 0end_table = []for row in new_table:if row[0] != None:cell_list = []cell_check = Falsefor cell in row:if cell != None:cell = cell.replace('\n', '')else:cell = ''if cell != '':cell_check = Truecell_list.append(cell)if cell_check == True:end_table.append(cell_list)end_table = self.drop_empty_cols(end_table)for row in end_table:self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,'type': 'excel', 'inside': str(row)}# self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow, 'type': 'excel',#                               'inside': ' '.join(row)}self.allrow += 1if count == 0:text = self.check_lines(page, '', buttom)text_list = text.split('\n')for _t in range(len(text_list)):self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,'type': 'text', 'inside': text_list[_t]}self.allrow += 1else:#文本数据text = self.check_lines(page, '', '')text_list = text.split('\n')for _t in range(len(text_list)):self.all_text[self.allrow] = {'page': page.page_number, 'allrow': self.allrow,'type': 'text', 'inside': text_list[_t]}self.allrow += 1first_re = '[^计](?:报告(?:全文)?(?:(修订版)|(修订稿)|(更正后))?)$'end_re = '^(?:\d|\\|\/|第|共|页|-|_| ){1,}'if self.last_num == 0:try:first_text = str(self.all_text[1]['inside'])end_text = str(self.all_text[len(self.all_text) - 1]['inside'])if re.search(first_re, first_text) and not '[' in end_text:self.all_text[1]['type'] = '页眉'if re.search(end_re, end_text) and not '[' in end_text:self.all_text[len(self.all_text) - 1]['type'] = '页脚'except:print(page.page_number)else:try:first_text = str(self.all_text[self.last_num + 2]['inside'])end_text = str(self.all_text[len(self.all_text) - 1]['inside'])if re.search(first_re, first_text) and '[' not in end_text:self.all_text[self.last_num + 2]['type'] = '页眉'if re.search(end_re, end_text) and '[' not in end_text:self.all_text[len(self.all_text) - 1]['type'] = '页脚'except:print(page.page_number)self.last_num = len(self.all_text) - 1def process_pdf(self):for i in range(len(self.pdf.pages)):self.extract_text_and_tables(self.pdf.pages[i])def save_all_text(self, path):with open(path, 'w', encoding='utf-8') as file:for key in self.all_text.keys():file.write(json.dumps(self.all_text[key], ensure_ascii=False) + '\n')def process_all_pdfs_in_folder(folder_path):file_paths = glob.glob(f'{folder_path}/*')file_paths = sorted(file_paths, reverse=True)for file_path in file_paths:print(file_path)try:processor = PDFProcessor(file_path)processor.process_pdf()save_path = 'RAG_ASMPLE_DATAS_TXTS/' + file_path.split('/')[-1].replace('.pdf', '.txt')processor.save_all_text(save_path)except:print('check')if __name__ == '__main__':# 需要解析的pdf文件路径pdf_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.pdf'# pdf解析后的txt内容文件out_path = r'C:\Users\WWS\RAG_ASMPLE_DATAS\2020-02-26__上海爱旭新能源股份有限公司__600732__爱旭股份__2019年__年度报告.txt'processor = PDFProcessor(pdf_path)processor.process_pdf()processor.save_all_text(out_path)

参考

版面分析–PDF解析神器pdfplumber
版面分析–富文本txt读取

补充


提取PDF中的图片并保存到本地

import pdfplumber
file_name = "**.pdf"# 需要解析的pdf的文件路径
output_file = "**.xlsx" # pdf解析后的内容with pdfplumber.open(file_name) as pdf:#获取第一页first_page = pdf.pages[1]print('页码:', first_page.page_number)print('page width:', first_page.width)print('page height:', first_page.height)# get the first page texttext = first_page.extract_text()print(text)# 获取第一页图片,获取到的是一个列表,列表中存储的是字典imgs = first_page.imagesi = 0for img in imgs:# 获取图片的二进制流print(img['stream'].get_data())with open(output_file, mode='wb') as f2:f2.write(img['stream'].get_data())

提取pdf 表格文本,保存为excel文件

import pdfplumber
from openpyxl import Workbook# 保存表格,需要安装openpyxl
file_name = '**.pdf'
output_file = '**.xlsx'
with pdfplumber.open(file_name) as pdf:page01 = pdf.pages[0]table = page01.extract_table()workbook = Workbook()sheet = workbook.activefor row in table:sheet.append(row)workbook.save(filename=output_file)

提取PDF表格 文本

import pdfplumber
file_name = '**.pdf'
output_file = '**.txt'
with pdfplumber.open(file_name) as p:page_count = len(p.pages)# 统计文档的页数for i in range(0, page_count):page = p.pages[i]# 提取每页的对象并存储textdata = page.extract_table()#提取每页的表格文字信息# table2 = page01.extract_tables()# 提取多个表格data = open(output_file , 'a') # 将 表格文字存放在需要存储的文档里面data.write(textdata )# 文档内容写入

提取PDF纯文本

import pdfplumber
file_name = '**.pdf'
output_file = '**.txt'
with pdfplumber.open(file_name) as p:page_count = len(p.pages)# 统计文档的页数for i in range(0, page_count):page = p.pages[i]# 提取每页的对象并存储textdata = page.extract_text()#提取每页的文字信息data = open(output_file , 'a') # 将 表格文字存放在需要存储的文档里面data.write(textdata )# 文档内容写入

读取富文本txt
python 读取文件函数有三种 read()、readline()、readlines()

  • read() 一次性读取所有文本
  • readline() 读取第一行的内容
  • readlines() 读取全部内容,以数列的格式返回
with open('rag_datas/story.txt', 'r', encoding='utf-8' ) as f:data = f.read()print(data)
with open('rag_datas/story.txt', 'r', encoding='utf-8' ) as f:data = f.readline()print(data)
with open('rag_datas/story.txt', 'r', encoding='utf-8' ) as f:for line in f.readlines():line = line.strip('\n')print(line)
http://www.hkea.cn/news/219996/

相关文章:

  • 厦门做网站seo的seo服务公司招聘
  • 安徽池州做企业网站百度搜索官方网站
  • 芜湖商城网站建设青岛百度快速优化排名
  • 我找伟宏篷布我做的事ko家的网站seoul怎么读
  • 即墨做网站优书网首页
  • 网站建设实践报告3000字放单平台
  • 中华人民共和国城乡住房建设厅网站seo技术外包
  • 网站做销售是斤么工作东莞网站营销推广
  • 做网站现在还行吗宁德市疫情
  • 响应式网站首页百度搜索资源
  • 工人找工作哪个网站好福州百度seo
  • 台湾做甜品的网站谷歌seo关键词排名优化
  • 织梦网站导入链接怎么做谷歌广告投放
  • 沈阳网站哪家公司做的好镇江关键字优化品牌
  • 台州本地做网站的做引流推广的平台600
  • 网站的导航用css怎么做网站外链查询
  • 青岛模版网站建设关键词优化按天计费
  • 高端网站建设服务器seo服务哪家好
  • 服装网站建设分析网站浏览器
  • 建站城企业邮箱怎么开通注册
  • html做动态网站cms
  • 一个网站建设需要多少钱百度seo排名优化公司
  • 网站做app的软件友博国际个人中心登录
  • 做网站用什么代码编写可口可乐软文营销案例
  • 宜昌网站建设哪家好厦门百度广告开户
  • 网站做二级域名外链
  • 网站建设服务费属于哪个大类电商seo搜索优化
  • 12380网站建设情况的报告网络seo首页
  • 个人如何在百度上做广告网站seo什么意思
  • java做网站编程合肥seo快排扣费