Java获取PDF关键字坐标

Z时代
2024-01-10
分类：综合

一、使用 itextpdf


<dependency>
    <groupId>com.itextpdf</groupId>
    <artifactId>itextpdf</artifactId>
    <version>5.5.13.1</version>
</dependency>

PdfKeyWordPosition.java


package com.util;
import com.itextpdf.awt.geom.Rectangle2D;
import com.itextpdf.text.pdf.PdfDictionary;
import com.itextpdf.text.pdf.PdfName;
import com.itextpdf.text.pdf.PdfReader;
import com.itextpdf.text.pdf.parser.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.ByteArrayOutputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * 获取pdf关键字坐标
 */
public class PdfKeyWordPosition {
    private static final Logger log = LoggerFactory.getLogger(PdfKeyWordPosition.class);

    /**
     * Locates every occurrence of a keyword in a PDF and returns the position
     * of the first character of each occurrence.
     *
     * <p>Each page is parsed on its own, so a keyword that spans a page break is
     * not found. Overlapping occurrences are all reported. Parsing failures are
     * logged and the matches collected so far are returned.
     *
     * @param pdfData raw bytes of the PDF document ({@code PdfReader} also has a
     *                constructor that takes a file path, if that is preferred)
     * @param keyWord text to search for; {@code null} or empty yields an empty list
     * @return one map per occurrence with the keys {@code "x"}, {@code "y"},
     *         {@code "page"}, {@code "fontWidth"} and {@code "fontHeight"};
     *         never {@code null}
     */
    public static List<Map<String, Object>> getWordsPcoordinate(byte[] pdfData, String keyWord) {
        List<Map<String, Object>> result = new ArrayList<>();
        // An empty keyword "matches" at every index and a null one cannot be searched for.
        if (keyWord == null || keyWord.isEmpty()) {
            return result;
        }
        PdfReader reader = null;
        try {
            reader = new PdfReader(pdfData);
            int pages = reader.getNumberOfPages();
            for (int pageNum = 1; pageNum <= pages; pageNum++) {
                float width = reader.getPageSize(pageNum).getWidth();
                float height = reader.getPageSize(pageNum).getHeight();
                RenderListenerHelper listener = new RenderListenerHelper(pageNum, width, height);

                // Run the page content stream through the listener to collect text and positions.
                PdfContentStreamProcessor processor = new PdfContentStreamProcessor(listener);
                PdfDictionary pageDic = reader.getPageN(pageNum);
                PdfDictionary resourcesDic = pageDic.getAsDict(PdfName.RESOURCES);
                processor.processContent(ContentByteUtils.getContentBytesForPage(reader, pageNum), resourcesDic);

                // content.charAt(k) was rendered at charPositions.get(k).
                String content = listener.getContent();
                List<Map<String, Object>> charPositions = listener.getCharPositions();

                // Resume one character after each match so that adjacent and
                // overlapping occurrences are not skipped.
                int from = 0;
                int keyIndex;
                while ((keyIndex = content.indexOf(keyWord, from)) != -1) {
                    result.add(charPositions.get(keyIndex));
                    from = keyIndex + 1;
                }
            }
        } catch (Exception e) {
            // Pass the message for the placeholder and the exception itself for the stack trace.
            log.error("获取pdf关键字坐标失败：{}", e.getMessage(), e);
        } finally {
            // reader stays null when the PdfReader constructor itself failed.
            if (reader != null) {
                reader.close();
            }
        }
        return result;
    }

    /**
     * Render listener that records, for one page, the extracted text and a
     * position entry for every character of that text (same index in both).
     */
    private static class RenderListenerHelper implements RenderListener {
        private final int pageNum;
        // Page dimensions are kept for callers that want relative coordinates, e.g.
        //   xPercent = Math.round(x / pageWidth * 10000) / 10000f
        //   yPercent = Math.round((1 - y / pageHeight) * 10000) / 10000f
        private final float pageWidth;
        private final float pageHeight;
        private final StringBuilder contentBuilder = new StringBuilder();
        private final List<Map<String, Object>> charPositions = new ArrayList<>();

        public RenderListenerHelper(int pageNum, float pageWidth, float pageHeight) {
            this.pageNum = pageNum;
            this.pageWidth = pageWidth;
            this.pageHeight = pageHeight;
        }

        /** Returns the text collected so far, one char per recorded position. */
        public String getContent() {
            return contentBuilder.toString();
        }

        /** Returns the recorded positions; index k belongs to {@code getContent().charAt(k)}. */
        public List<Map<String, Object>> getCharPositions() {
            return charPositions;
        }

        // Text-block start callback; nothing to do.
        @Override
        public void beginTextBlock() {
        }

        /**
         * Records one character and one position entry per character render info
         * of the given text chunk.
         */
        @Override
        public void renderText(TextRenderInfo renderInfo) {
            List<TextRenderInfo> characterRenderInfos = renderInfo.getCharacterRenderInfos();
            for (TextRenderInfo charInfo : characterRenderInfos) {
                String word = charInfo.getText();
                // Nothing would be appended to the content for an empty text, so
                // recording a position would shift every later index by one.
                if (word == null || word.isEmpty()) {
                    continue;
                }
                // Exactly one char is kept per position entry to keep both
                // sequences index-aligned.
                // NOTE(review): text mapped to several chars is truncated to its last char.
                if (word.length() > 1) {
                    word = word.substring(word.length() - 1);
                }
                // Bounding rectangle of the ascent line. Alternatives are
                // getCenterX()/getCenterY() or getMinX()/getMaxY() on the same rectangle.
                Rectangle2D.Float bounds = charInfo.getAscentLine().getBoundingRectange();
                Map<String, Object> coordinate = new HashMap<>();
                coordinate.put("x", bounds.x);
                coordinate.put("y", bounds.y);
                coordinate.put("page", pageNum);
                coordinate.put("fontWidth", bounds.width);
                coordinate.put("fontHeight", bounds.height);
                charPositions.add(coordinate);
                contentBuilder.append(word);
            }
        }

        // Text-block end callback; nothing to do.
        @Override
        public void endTextBlock() {
        }

        // Images are irrelevant for keyword lookup.
        @Override
        public void renderImage(ImageRenderInfo renderInfo) {
        }
    }

    /** Reads the whole file at {@code path} into memory. */
    private static byte[] readAllBytes(String path) throws IOException {
        try (InputStream is = new FileInputStream(path);
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            // Fixed-size buffer: available() may be 0, which would make read() return 0 forever.
            byte[] buffer = new byte[8192];
            int n;
            while ((n = is.read(buffer)) != -1) {
                bos.write(buffer, 0, n);
            }
            return bos.toByteArray();
        }
    }

    /**
     * Demo: prints the position of every occurrence of a keyword in a sample file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // The backslash must be escaped; "\t" would be a tab character.
            byte[] bytes = readAllBytes("D:\\test.pdf");
            List<Map<String, Object>> wordsPcoordinates = getWordsPcoordinate(bytes, "日期");
            for (Map<String, Object> map : wordsPcoordinates) {
                System.out.println("x坐标 -> " + map.get("x"));
                System.out.println("y坐标 -> " + map.get("y"));
                System.out.println("页数 -> " + map.get("page"));
                System.out.println("字体长度 -> " + map.get("fontWidth"));
                System.out.println("字段高度 -> " + map.get("fontHeight"));
                System.out.println();
            }
        } catch (Exception e) {
            log.error("Keyword position demo failed", e);
        }
    }
}

二、使用 pdfbox


<dependency>
    <groupId>org.apache.pdfbox</groupId>
    <artifactId>pdfbox</artifactId>
    <version>2.0.20</version>
</dependency>

PdfBoxKeyWordPosition.java


package com.util;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * 继承 pdfbox 中 PDFTextStripper类，获取关键字坐标
 */
public class PdfBoxKeyWordPosition extends PDFTextStripper {
    private static final Logger log = LoggerFactory.getLogger(PdfBoxKeyWordPosition.class);
    // Keyword split into single characters.
    private final char[] key;
    // Path of the PDF file (only used by the path-based variant below).
    private String pdfPath;
    // Raw bytes of the PDF document.
    private byte[] bytes;
    // Matches found on the page currently being processed.
    private final List<Map<String, Object>> pageList = new ArrayList<>();

    /*// Variant that reads the document from a file path
    public PdfBoxKeyWordPosition(String keyWords, String pdfPath) throws IOException {
        super();
        super.setSortByPosition(true);
        this.pdfPath = pdfPath;
        this.key = keyWords.toCharArray();
    }*/

    /**
     * Creates a stripper that searches the given PDF bytes for a keyword.
     *
     * @param keyWords keyword to search for
     * @param bytes    raw bytes of the PDF document
     * @throws IOException if the underlying {@code PDFTextStripper} cannot be created
     */
    public PdfBoxKeyWordPosition(String keyWords, byte[] bytes) throws IOException {
        super();
        super.setSortByPosition(true);
        this.bytes = bytes;
        this.key = keyWords.toCharArray();
    }

    /**
     * Parses the document page by page and collects the position of every
     * keyword occurrence.
     *
     * <p>Failures are logged and the matches collected so far are returned.
     * Every call starts from scratch and returns a new list.
     *
     * @return one map per occurrence with the keys {@code "x"}, {@code "y"}
     *         and {@code "page"}; never {@code null}
     */
    public List<Map<String, Object>> getCoordinate() {
        List<Map<String, Object>> coordinates = new ArrayList<>();
        pageList.clear();
        try {
            // document = PDDocument.load(new File(pdfPath)); // path-based variant
            document = PDDocument.load(bytes);
            int pages = document.getNumberOfPages();
            for (int i = 1; i <= pages; i++) {
                // Restrict extraction to a single page so matches can be tagged with it.
                super.setStartPage(i);
                super.setEndPage(i);
                // The extracted text is not needed; writeString() records the matches.
                Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
                super.writeText(document, dummy);
                for (Map<String, Object> li : pageList) {
                    li.put("page", i);
                }
                coordinates.addAll(pageList);
                pageList.clear();
            }
        } catch (Exception e) {
            // Pass the message for the placeholder and the exception itself for the stack trace.
            log.error("获取pdf关键字坐标失败：{}", e.getMessage(), e);
        } finally {
            pageList.clear();
            try {
                if (document != null) {
                    document.close();
                }
            } catch (IOException e) {
                log.error("关闭文件失败：{}", e.getMessage(), e);
            }
        }
        return coordinates;
    }

    /**
     * Scans one chunk of text for the keyword and records the position of the
     * first character of every occurrence in the current page's list.
     *
     * <p>The keyword must be fully contained in {@code textPositions}; an
     * occurrence split across two calls is not detected.
     *
     * @param string        the text of the chunk (unused)
     * @param textPositions one entry per character of the chunk
     */
    @Override
    protected void writeString(String string, List<TextPosition> textPositions) throws IOException {
        if (key.length == 0) {
            return;
        }
        // Last start index at which the whole keyword still fits into the chunk.
        int lastStart = textPositions.size() - key.length;
        for (int i = 0; i <= lastStart; i++) {
            if (!matchesAt(textPositions, i)) {
                continue;
            }
            TextPosition tp = textPositions.get(i);
            Map<String, Object> coordinate = new HashMap<>();
            // The font size is added to x; plain tp.getX() can be used instead.
            Float x = tp.getX() + tp.getFontSize();
            // Converted to a bottom-based y and shifted down by four font sizes;
            // plain tp.getPageHeight() - tp.getY() can be used instead.
            // NOTE(review): the offsets look like caller-specific adjustments — confirm.
            Float y = tp.getPageHeight() - tp.getY() - 4 * tp.getFontSize();
            coordinate.put("x", x);
            coordinate.put("y", y);
            pageList.add(coordinate);
        }
    }

    /** Returns whether the keyword's characters appear consecutively starting at {@code start}. */
    private boolean matchesAt(List<TextPosition> textPositions, int start) {
        for (int j = 0; j < key.length; j++) {
            String unicode = textPositions.get(start + j).getUnicode();
            // Constant on the left keeps the comparison safe for a null unicode value.
            if (!String.valueOf(key[j]).equals(unicode)) {
                return false;
            }
        }
        return true;
    }

    /** Reads the whole file at {@code path} into memory. */
    private static byte[] readAllBytes(String path) throws IOException {
        try (InputStream is = new FileInputStream(path);
             ByteArrayOutputStream bos = new ByteArrayOutputStream()) {
            // Fixed-size buffer: available() may be 0, which would make read() return 0 forever.
            byte[] buffer = new byte[8192];
            int n;
            while ((n = is.read(buffer)) != -1) {
                bos.write(buffer, 0, n);
            }
            return bos.toByteArray();
        }
    }

    /**
     * Demo: prints the position of every occurrence of a keyword in a sample file.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // The backslash must be escaped; "\t" would be a tab character.
            byte[] bytes = readAllBytes("D:\\test.pdf");
            PdfBoxKeyWordPosition pdf = new PdfBoxKeyWordPosition("日期", bytes);
            List<Map<String, Object>> wordsPcoordinates = pdf.getCoordinate();
            for (Map<String, Object> map : wordsPcoordinates) {
                System.out.println("x坐标 -> " + map.get("x"));
                System.out.println("y坐标 -> " + map.get("y"));
                System.out.println();
            }
        } catch (Exception e) {
            log.error("Keyword position demo failed", e);
        }
    }
}

以上是 Java获取PDF关键字坐标的全部内容，来源链接： utcz.com/z/517831.html

Java获取PDF关键字坐标

其他人也看了：