Python对pdf中的关键字过滤(pdfminer3k或pdfminer使用)

python

最近在实习,老板一下子发给了我120份研报,然而很多都是没用的。聪明的大脑一定要想办法让电脑帮助自己完成简单的工作!

下面是用Python筛选出含有“丙烯”关键字的PDF的程序,由于文件的保密性,只能贴出代码。

注意:

pip install pdfminer3k而不是pdfminer
导入的时候名字仍然是pdfminer,原因我猜是Python版本的问题

# -*- coding: utf-8 -*-

"""

Created on Fri May 10 16:54:16 2019

@author: didi.lv

"""

import os

from io import StringIO

import shutil

# 注意:一定要pip install pdfminer3k 而不是pdfminer

from pdfminer.pdfinterp import PDFResourceManager, process_pdf

from pdfminer.converter import TextConverter

from pdfminer.layout import LAParams

# 读取pdf的函数,返回内容

def readPdf(pdf_file):
    """Extract the text of a PDF and return it as a single string.

    Args:
        pdf_file: The PDF handed to ``process_pdf`` as ``fp``; the caller in
            this script passes a file object opened with mode ``'rb'``.

    Returns:
        Everything ``TextConverter`` wrote into the in-memory buffer.
    """
    rsrcmgr = PDFResourceManager()
    laparams = LAParams()

    # The buffer is a context manager so it is released even when parsing
    # fails part-way through a damaged PDF.
    with StringIO() as retstr:
        device = TextConverter(rsrcmgr=rsrcmgr, outfp=retstr, laparams=laparams)
        try:
            process_pdf(rsrcmgr=rsrcmgr, device=device, fp=pdf_file)
        finally:
            # Always close the converter, also on error, before reading the
            # buffer (same order as before: close device, then getvalue()).
            device.close()

        return retstr.getvalue()

def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of bare file names (no directory part). Sub-directories are
        not descended into, so every returned name can be joined with
        ``file_dir`` by the caller. An empty list is returned when
        ``os.walk`` yields nothing (for example, the directory is missing).
    """
    # os.walk is top-down by default, so its first tuple describes file_dir
    # itself. Returning there avoids handing back the listing of whichever
    # sub-directory happened to be visited last.
    for _root, _dirs, files in os.walk(file_dir):
        return files

    # Nothing was yielded: report "no files" instead of failing on an
    # unbound local variable.
    return []

if __name__ == '__main__':
    # Directory whose PDFs are scanned for the keyword.
    file_dir = r'C:\\Users\didi.lv\Desktop\filenames'
    # Matching reports are copied from filenames1 to filenames2.
    # NOTE(review): the scan reads from "filenames" but the copy source is
    # "filenames1" -- presumably both hold the same reports; confirm.
    origin_dir = r'C:\\Users\didi.lv\Desktop\filenames1'
    target_dir = r'C:\\Users\didi.lv\Desktop\filenames2'

    # NOTE(review): the first 48 entries are skipped, exactly as before --
    # presumably an offset left over from resuming an interrupted run.
    # Set to 0 to scan every report.
    start_index = 48

    # Use the returned names directly instead of splitting the str() of the
    # list on ".pdf', ", which breaks on names containing an apostrophe and
    # on files that are not PDFs.
    pdf_names = [name for name in file_name(file_dir) if name.endswith('.pdf')]

    for i, name_check in enumerate(pdf_names[start_index:], start=1):
        print('--------------------------------------------------------')
        print(i)

        name_check_open = file_dir + '\\' + name_check

        # Close each PDF as soon as its text has been extracted so handles
        # do not pile up over a long run.
        with open(name_check_open, 'rb') as pdf_file:
            content = readPdf(pdf_file)

        if '丙烯' in content:
            file_origin = origin_dir + '\\' + name_check
            file_target = target_dir + '\\' + name_check
            shutil.copyfile(file_origin, file_target)
            print('copy No. %d file' % i)

 


原文:https://blog.csdn.net/Eric2016_Lv/article/details/90082280

以上是 Python对pdf中的关键字过滤(pdfminer3k或pdfminer使用) 的全部内容, 来源链接: utcz.com/z/388160.html

回到顶部