使用iTextSharp提取和更新现有PDF中的链接

Z时代
2024-01-10
分类：问答

我需要在网络上发布几个（其实是很多）PDF文件，但其中许多含有硬编码的 file:// 链接以及指向非公开位置的链接。我需要逐一读取这些PDF，并把链接更新为正确的地址。我已经开始用iTextSharp编写一个应用程序，用来遍历目录和文件、找出PDF并逐页处理。接下来，我需要找到其中的链接，并更新那些不正确的链接。

// Visit every page of every PDF found in the first-level subfolders of the root folder.
// Note: PDFs sitting directly in the root folder itself are not visited by this loop.
string path = "c:\\html";
DirectoryInfo rootFolder = new DirectoryInfo(path);
foreach (DirectoryInfo di in rootFolder.GetDirectories())
{
    // Only *.pdf files directly inside this subfolder are considered
    foreach (FileInfo pdf in di.GetFiles("*.pdf"))
    {
        PdfReader reader = new PdfReader(pdf.FullName);
        try
        {
            for (int p = 1; p <= reader.NumberOfPages; p++)
            {
                // Raw content bytes of page p (read but not used yet)
                byte[] bt = reader.GetPageContent(p);
            }
        }
        finally
        {
            // Always close the reader once this file is done, even if reading a page throws
            reader.Close();
        }
    }
}

坦率地说，拿到页面内容之后，我就不知道接下来该如何用iTextSharp继续处理了。我已经看过SourceForge上的iTextSharp示例，但并没有找到我想要的内容。

任何帮助将不胜感激。

谢谢。

回答：

如果您不了解PDF格式的内部结构以及iText / iTextSharp对它的抽象和实现，这项操作会有些复杂。您需要了解如何使用PdfDictionary对象，并通过PdfName键在其中查找内容。掌握这一点之后，您就可以对照正式的PDF规范，比较轻松地浏览文档的内部结构。在适用的地方，我会在括号中注明PDF规范的相关章节。

无论如何，PDF中的链接是以注释（annotation）的形式存储的（PDF Ref 12.5）。注释是基于页面的，因此您首先需要分别获取每个页面的注释数组。注释有很多不同的类型，因此您需要检查每个注释的SUBTYPE，看它是否被设置为LINK（12.5.6.5）。每个链接都应有一个与之关联的ACTION字典（12.6.2），您需要检查该操作的S键以确定它是哪种类型的操作。操作的类型有很多种：链接可能是内部链接、打开文件的链接、播放声音的链接或其他类型（12.6.4.1）。您只需要查找类型为URI的链接（注意末尾是字母 I，而不是字母 L）。URI操作（12.6.4.7）的URI键保存着要导航到的实际地址。（另外还有一个用于图像映射的IsMap属性，我想不出有谁会用到它。）

呼。还在看吗？下面是一个针对iTextSharp 5.1.1.0的、可完整运行的VS 2010 C# WinForms应用程序，它基于我之前在此处发布的代码。这段代码主要做两件事：1）创建一个带有指向Google.com的链接的示例PDF；2）把该链接替换为指向bing.com的链接。代码中已有比较详细的注释，如有任何问题，欢迎随时提问。

using System;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text;
using iTextSharp.text.pdf;
using System.IO;
namespace WindowsFormsApplication1
{
    /// <summary>
    /// Demo form: when loaded it writes a sample PDF containing one hyperlink, then writes a copy of
    /// that PDF in which every URI link points somewhere else, and finally closes itself.
    /// </summary>
    public partial class Form1 : Form
    {
        //Folder that we are working in ("Hyperlinked PDFs" on the current user's desktop)
        private static readonly string WorkingFolder = Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Hyperlinked PDFs");
        //Sample PDF that CreateSamplePdf writes and UpdatePdfLinks reads
        private static readonly string BaseFile = Path.Combine(WorkingFolder, "OldFile.pdf");
        //Final file that UpdatePdfLinks writes
        private static readonly string OutputFile = Path.Combine(WorkingFolder, "NewFile.pdf");

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Runs the whole demo (create the sample, rewrite its links) and then closes the form.
        /// </summary>
        private void Form1_Load(object sender, EventArgs e)
        {
            CreateSamplePdf();
            UpdatePdfLinks();
            this.Close();
        }

        /// <summary>
        /// Writes <see cref="BaseFile"/>: a single letter-size page with the blue text "Go to URL"
        /// that links to http://www.google.com/.
        /// </summary>
        private static void CreateSamplePdf()
        {
            //Create our output directory if it does not exist
            Directory.CreateDirectory(WorkingFolder);

            //Create our sample PDF
            using (iTextSharp.text.Document doc = new iTextSharp.text.Document(PageSize.LETTER))
            {
                using (FileStream fs = new FileStream(BaseFile, FileMode.Create, FileAccess.Write, FileShare.Read))
                {
                    using (PdfWriter writer = PdfWriter.GetInstance(doc, fs))
                    {
                        doc.Open();

                        //Turn our hyperlink blue
                        iTextSharp.text.Font blueFont = FontFactory.GetFont("Arial", 12, iTextSharp.text.Font.NORMAL, iTextSharp.text.BaseColor.BLUE);
                        doc.Add(new Paragraph(new Chunk("Go to URL", blueFont).SetAction(new PdfAction("http://www.google.com/", false))));

                        //Close the document explicitly before the writer and the stream are disposed
                        doc.Close();
                    }
                }
            }
        }

        /// <summary>
        /// Reads <see cref="BaseFile"/>, points every URI link action at http://www.bing.com/ and
        /// writes the result to <see cref="OutputFile"/>.
        /// </summary>
        /// <remarks>
        /// Annotations that are not dictionaries, are not links, have no action dictionary, or whose
        /// action is not a URI action are skipped.
        /// </remarks>
        private static void UpdatePdfLinks()
        {
            //Open our reader
            PdfReader reader = new PdfReader(BaseFile);
            try
            {
                //Get the page count
                int pageCount = reader.NumberOfPages;

                //Loop through each page (PDF pages are numbered from 1)
                for (int i = 1; i <= pageCount; i++)
                {
                    //Get the current page
                    PdfDictionary pageDictionary = reader.GetPageN(i);

                    //Get all of the annotations for the current page
                    PdfArray annots = pageDictionary.GetAsArray(PdfName.ANNOTS);

                    //Make sure we have something; an empty array simply yields no iterations below
                    if (annots == null)
                    {
                        continue;
                    }

                    //Loop through each annotation
                    foreach (PdfObject item in annots.ArrayList)
                    {
                        //Resolve a possible indirect reference; skip anything that is not a dictionary
                        PdfDictionary annotation = PdfReader.GetPdfObject(item) as PdfDictionary;
                        if (annotation == null)
                        {
                            continue;
                        }

                        //Make sure this annotation is a link (a missing SUBTYPE counts as "not a link")
                        if (!PdfName.LINK.Equals(annotation.GetAsName(PdfName.SUBTYPE)))
                        {
                            continue;
                        }

                        //Get the ACTION for the current annotation; GetAsDict also follows an
                        //indirect reference, which a plain cast of Get() would choke on
                        PdfDictionary action = annotation.GetAsDict(PdfName.A);
                        if (action == null)
                        {
                            continue;
                        }

                        //Test if it is a URI action (a missing S key counts as "not a URI action")
                        if (PdfName.URI.Equals(action.GetAsName(PdfName.S)))
                        {
                            //Change the URI to something else
                            action.Put(PdfName.URI, new PdfString("http://www.bing.com/"));
                        }
                    }
                }

                //Next we create a new document and import each page from the reader above
                using (FileStream fs = new FileStream(OutputFile, FileMode.Create, FileAccess.Write, FileShare.None))
                {
                    using (Document doc = new Document())
                    {
                        using (PdfCopy writer = new PdfCopy(doc, fs))
                        {
                            doc.Open();
                            for (int i = 1; i <= pageCount; i++)
                            {
                                writer.AddPage(writer.GetImportedPage(reader, i));
                            }
                            doc.Close();
                        }
                    }
                }
            }
            finally
            {
                //Always close the reader, even if rewriting or copying fails
                reader.Close();
            }
        }
    }
}