Estrai immagini usando iTextSharp
-
03-07-2019 - |
Domanda
Ho usato questo codice con grande successo per estrarre la prima immagine trovata in ogni pagina di un PDF. Tuttavia, ora non funziona con alcuni nuovi PDF per una ragione sconosciuta. Ho usato altri strumenti (Datalogics, ecc.) Che tirano fuori le immagini bene con questi nuovi PDF. Tuttavia, non voglio acquistare Datalogics o altri strumenti se posso usare iTextSharp. Qualcuno può dirmi perché questo codice non trova le immagini nel PDF?
Conosciuti: i miei PDF hanno solo 1 immagine per pagina e nient'altro.
using iTextSharp.text;
using iTextSharp.text.pdf;
...
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
PdfDictionary res = (PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj = (PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type = (PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
if (PdfName.IMAGE.Equals(type))
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
break;
}
}
}
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
Soluzione
Ho scoperto che il mio problema era che non cercavo ricorsivamente immagini all'interno di moduli e gruppi. Fondamentalmente, il codice originale avrebbe trovato solo immagini che erano incorporate nella radice del documento pdf. Ecco il metodo rivisto più un nuovo metodo (FindImageInPDFDictionary) che cerca ricorsivamente le immagini nella pagina. NOTA: i difetti di supportare solo immagini JPEG e non compresse sono ancora validi. Vedi il codice di R Ubben per le opzioni per correggere quei difetti. HTH qualcuno.
public static void ExtractImagesFromPDF(string sourcePdf, string outputPath)
{
// NOTE: This will only get the first image it finds per page.
PdfReader pdf = new PdfReader(sourcePdf);
RandomAccessFileOrArray raf = new iTextSharp.text.pdf.RandomAccessFileOrArray(sourcePdf);
try
{
for (int pageNumber = 1; pageNumber <= pdf.NumberOfPages; pageNumber++)
{
PdfDictionary pg = pdf.GetPageN(pageNumber);
// recursively search pages, forms and groups for images.
PdfObject obj = FindImageInPDFDictionary(pg);
if (obj != null)
{
int XrefIndex = Convert.ToInt32(((PRIndirectReference)obj).Number.ToString(System.Globalization.CultureInfo.InvariantCulture));
PdfObject pdfObj = pdf.GetPdfObject(XrefIndex);
PdfStream pdfStrem = (PdfStream)pdfObj;
byte[] bytes = PdfReader.GetStreamBytesRaw((PRStream)pdfStrem);
if ((bytes != null))
{
using (System.IO.MemoryStream memStream = new System.IO.MemoryStream(bytes))
{
memStream.Position = 0;
System.Drawing.Image img = System.Drawing.Image.FromStream(memStream);
// must save the file while stream is open.
if (!Directory.Exists(outputPath))
Directory.CreateDirectory(outputPath);
string path = Path.Combine(outputPath, String.Format(@"{0}.jpg", pageNumber));
System.Drawing.Imaging.EncoderParameters parms = new System.Drawing.Imaging.EncoderParameters(1);
parms.Param[0] = new System.Drawing.Imaging.EncoderParameter(System.Drawing.Imaging.Encoder.Compression, 0);
System.Drawing.Imaging.ImageCodecInfo jpegEncoder = Utilities.GetImageEncoder("JPEG");
img.Save(path, jpegEncoder, parms);
}
}
}
}
}
catch
{
throw;
}
finally
{
pdf.Close();
raf.Close();
}
}
private static PdfObject FindImageInPDFDictionary(PdfDictionary pg)
{
PdfDictionary res =
(PdfDictionary)PdfReader.GetPdfObject(pg.Get(PdfName.RESOURCES));
PdfDictionary xobj =
(PdfDictionary)PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)PdfReader.GetPdfObject(obj);
PdfName type =
(PdfName)PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE));
//image at the root of the pdf
if (PdfName.IMAGE.Equals(type))
{
return obj;
}// image inside a form
else if (PdfName.FORM.Equals(type))
{
return FindImageInPDFDictionary(tg);
} //image inside a group
else if (PdfName.GROUP.Equals(type))
{
return FindImageInPDFDictionary(tg);
}
}
}
}
return null;
}
Altri suggerimenti
Ecco una soluzione più semplice:
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject(imgPRStream);
System.Drawing.Image img = pdfImage.GetDrawingImage();
Il codice seguente incorpora tutte le idee di Dave e R Ubben sopra, inoltre restituisce un elenco completo di tutte le immagini e si occupa anche di più profondità di bit. Ho dovuto convertirlo in VB per il progetto a cui sto lavorando, mi dispiace per quello ...
Private Sub getAllImages(ByVal dict As pdf.PdfDictionary, ByVal images As List(Of Byte()), ByVal doc As pdf.PdfReader)
Dim res As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(dict.Get(pdf.PdfName.RESOURCES)), pdf.PdfDictionary)
Dim xobj As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(res.Get(pdf.PdfName.XOBJECT)), pdf.PdfDictionary)
If xobj IsNot Nothing Then
For Each name As pdf.PdfName In xobj.Keys
Dim obj As pdf.PdfObject = xobj.Get(name)
If (obj.IsIndirect) Then
Dim tg As pdf.PdfDictionary = CType(pdf.PdfReader.GetPdfObject(obj), pdf.PdfDictionary)
Dim subtype As pdf.PdfName = CType(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)), pdf.PdfName)
If pdf.PdfName.IMAGE.Equals(subtype) Then
Dim xrefIdx As Integer = CType(obj, pdf.PRIndirectReference).Number
Dim pdfObj As pdf.PdfObject = doc.GetPdfObject(xrefIdx)
Dim str As pdf.PdfStream = CType(pdfObj, pdf.PdfStream)
Dim bytes As Byte() = pdf.PdfReader.GetStreamBytesRaw(CType(str, pdf.PRStream))
Dim filter As String = tg.Get(pdf.PdfName.FILTER).ToString
Dim width As String = tg.Get(pdf.PdfName.WIDTH).ToString
Dim height As String = tg.Get(pdf.PdfName.HEIGHT).ToString
Dim bpp As String = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString
If filter = "/FlateDecode" Then
bytes = pdf.PdfReader.FlateDecode(bytes, True)
Dim pixelFormat As System.Drawing.Imaging.PixelFormat
Select Case Integer.Parse(bpp)
Case 1
pixelFormat = Drawing.Imaging.PixelFormat.Format1bppIndexed
Case 24
pixelFormat = Drawing.Imaging.PixelFormat.Format24bppRgb
Case Else
Throw New Exception("Unknown pixel format " + bpp)
End Select
Dim bmp As New System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat)
Dim bmd As System.Drawing.Imaging.BitmapData = bmp.LockBits(New System.Drawing.Rectangle(0, 0, Int32.Parse(width), Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat)
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length)
bmp.UnlockBits(bmd)
Using ms As New MemoryStream
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
bytes = ms.GetBuffer
End Using
End If
images.Add(bytes)
ElseIf pdf.PdfName.FORM.Equals(subtype) Or pdf.PdfName.GROUP.Equals(subtype) Then
getAllImages(tg, images, doc)
End If
End If
Next
End If
End Sub
Dalla versione C #:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc){
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
pdf.PdfName subtype = (pdf.PdfName)(pdf.PdfReader.GetPdfObject(tg.Get(pdf.PdfName.SUBTYPE)));
if (pdf.PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((pdf.PRIndirectReference)obj).Number;
pdf.PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
pdf.PdfStream str = (pdf.PdfStream)(pdfObj);
byte[] bytes = pdf.PdfReader.GetStreamBytesRaw((pdf.PRStream)str);
string filter = tg.Get(pdf.PdfName.FILTER).ToString();
string width = tg.Get(pdf.PdfName.WIDTH).ToString();
string height = tg.Get(pdf.PdfName.HEIGHT).ToString();
string bpp = tg.Get(pdf.PdfName.BITSPERCOMPONENT).ToString();
if (filter == "/FlateDecode")
{
bytes = pdf.PdfReader.FlateDecode(bytes, true);
System.Drawing.Imaging.PixelFormat pixelFormat;
switch (int.Parse(bpp))
{
case 1:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format1bppIndexed;
break;
case 24:
pixelFormat = System.Drawing.Imaging.PixelFormat.Format24bppRgb;
break;
default:
throw new Exception("Unknown pixel format " + bpp);
}
var bmp = new System.Drawing.Bitmap(Int32.Parse(width), Int32.Parse(height), pixelFormat);
System.Drawing.Imaging.BitmapData bmd = bmp.LockBits(new System.Drawing.Rectangle(0, 0, Int32.Parse(width),
Int32.Parse(height)), System.Drawing.Imaging.ImageLockMode.WriteOnly, pixelFormat);
Marshal.Copy(bytes, 0, bmd.Scan0, bytes.Length);
bmp.UnlockBits(bmd);
using (var ms = new MemoryStream())
{
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
bytes = ms.GetBuffer();
}
}
images.Add(System.Drawing.Image.FromStream(new MemoryStream(bytes)));
}
else if (pdf.PdfName.FORM.Equals(subtype) || pdf.PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}
Questa è solo un'altra rehash delle idee degli altri, ma quella che ha funzionato per me. Qui uso lo snippet di cattura delle immagini di @ Malco con il looping di R Ubben:
private IList<System.Drawing.Image> GetImagesFromPdfDict(PdfDictionary dict, PdfReader doc)
{
List<System.Drawing.Image> images = new List<System.Drawing.Image>();
PdfDictionary res = (PdfDictionary)(PdfReader.GetPdfObject(dict.Get(PdfName.RESOURCES)));
PdfDictionary xobj = (PdfDictionary)(PdfReader.GetPdfObject(res.Get(PdfName.XOBJECT)));
if (xobj != null)
{
foreach (PdfName name in xobj.Keys)
{
PdfObject obj = xobj.Get(name);
if (obj.IsIndirect())
{
PdfDictionary tg = (PdfDictionary)(PdfReader.GetPdfObject(obj));
PdfName subtype = (PdfName)(PdfReader.GetPdfObject(tg.Get(PdfName.SUBTYPE)));
if (PdfName.IMAGE.Equals(subtype))
{
int xrefIdx = ((PRIndirectReference)obj).Number;
PdfObject pdfObj = doc.GetPdfObject(xrefIdx);
PdfStream str = (PdfStream)(pdfObj);
iTextSharp.text.pdf.parser.PdfImageObject pdfImage =
new iTextSharp.text.pdf.parser.PdfImageObject((PRStream)str);
System.Drawing.Image img = pdfImage.GetDrawingImage();
images.Add(img);
}
else if (PdfName.FORM.Equals(subtype) || PdfName.GROUP.Equals(subtype))
{
images.AddRange(GetImagesFromPdfDict(tg, doc));
}
}
}
}
return images;
}
Quanto sopra funzionerà solo con JPEG. Escludendo le immagini in linea e i file incorporati, è necessario passare attraverso gli oggetti del sottotipo IMMAGINE, quindi guardare il filtro e intraprendere le azioni appropriate. Ecco un esempio, supponendo che abbiamo un PdfObject del sottotipo IMMAGINE:
PdfReader pdf = new PdfReader("c:\\temp\\exp0.pdf");
int xo=pdf.XrefSize;
for (int i=0;i<xo;i++)
{
PdfObject obj=pdf.GetPdfObject(i);
if (obj!=null && obj.IsStream())
{
PdfDictionary pd=(PdfDictionary)obj;
if (pd.Contains(PdfName.SUBTYPE) && pd.Get(PdfName.SUBTYPE).ToString()=="/Image")
{
string filter=pd.Get(PdfName.FILTER).ToString();
string width=pd.Get(PdfName.WIDTH).ToString();
string height=pd.Get(PdfName.HEIGHT).ToString();
string bpp=pd.Get(PdfName.BITSPERCOMPONENT).ToString();
string extent=".";
byte [] img=null;
switch (filter)
{
case "/FlateDecode":
byte[] arr=PdfReader.FlateDecode(PdfReader.GetStreamBytesRaw((PRStream)obj),true);
Bitmap bmp=new Bitmap(Int32.Parse(width),Int32.Parse(height),PixelFormat.Format24bppRgb);
BitmapData bmd=bmp.LockBits(new Rectangle(0,0,Int32.Parse(width),Int32.Parse(height)),ImageLockMode.WriteOnly,
PixelFormat.Format24bppRgb);
Marshal.Copy(arr,0,bmd.Scan0,arr.Length);
bmp.UnlockBits(bmd);
bmp.Save("c:\\temp\\bmp1.png",ImageFormat.Png);
break;
default:
break;
}
}
}
}
Questo ovviamente rovinerà il colore a causa del BGR di Microsoft, ovviamente, ma volevo mantenerlo breve. Fai qualcosa di simile per " / CCITTFaxDecode " ;, ecc.