Per un'applicazione web in C# vorrei indicizzare il testo di file PDF, DOC, ecc. memorizzati in un database.

Ho sperimentato con un esempio di IFilter trovato su CodeProject, che funziona benissimo con i file del file system, ma i miei file sono memorizzati in un database MS-SQL.

Qualcuno può aiutarmi a trovare un esempio per estrarre il testo da file memorizzati in un database, o ha un'idea di come modificare il codice di CodeProject perché lavori con un database invece che con il file system?

Finalmente, dopo molte ore, ho capito come far funzionare il tutto! Avevo bisogno di eseguire un IFilter sul contenuto di PDF memorizzati in un database e volevo evitare di salvare i dati in file temporanei.

Per prima cosa ho provato a usare l'API BindIFilterFromStream per creare un IFilter per contenuti memorizzati in uno stream, ma sembra che non funzioni correttamente (almeno non in questo scenario). Quindi non seguite quella strada.

Bisogna invece creare un IFilter a partire da un'estensione di file (o ottenerlo in qualche altro modo). Poi si può accedere all'interfaccia COM IPersistStream e usarla per caricare il contenuto del PDF nell'IFilter. Il resto funziona come per i file. Si noti tuttavia che l'interfaccia IPersistStream potrebbe non essere implementata da ogni IFilter. Con l'IFilter di Adobe per i PDF, però, funziona.

Il codice dovrebbe essere simile a questo (ho tolto parte del controllo dei codici di ritorno per rendere il codice più leggibile; in ogni caso si dovrebbero verificare tutti i possibili codici di ritorno).

/// <summary>
/// Extracts the text of a PDF held in a stream by loading it into an IFilter
/// through IPersistStream, without writing a temporary file.
/// </summary>
/// <param name="s">Stream positioned at the start of the document; its Length must be available.</param>
/// <returns>The text returned by ExtractTextFromIFilter for the loaded filter.</returns>
private string ParseIFilter(Stream s)
{
   // Get an IFilter for a file or file extension
   IFilter filter = null;
   FilterReturnCodes result = NativeMethods.LoadIFilter(".pdf", null, ref filter);
   if (result != FilterReturnCodes.S_OK || filter == null)
   {
      throw Marshal.GetExceptionForHR(unchecked((int)result))
         ?? new InvalidOperationException("LoadIFilter returned " + result);
   }

   IntPtr nativePtr = IntPtr.Zero;
   System.Runtime.InteropServices.ComTypes.IStream comStream = null;
   try
   {
      // Stream.Read may return fewer bytes than requested, so read until the
      // buffer is full or the stream ends.
      byte[] buffer = new byte[s.Length];
      int length = 0;
      while (length < buffer.Length)
      {
         int read = s.Read(buffer, length, buffer.Length - length);
         if (read == 0)
         {
            break;
         }

         length += read;
      }

      // Copy the content to global memory
      nativePtr = Marshal.AllocHGlobal(length);
      Marshal.Copy(buffer, 0, nativePtr, length);

      // Create a COM stream
      int hr = NativeMethods.CreateStreamOnHGlobal(nativePtr, true, out comStream);
      Marshal.ThrowExceptionForHR(hr);

      // fDeleteOnRelease is true: from here on the stream frees the memory.
      nativePtr = IntPtr.Zero;

      // Load the contents to the iFilter using IPersistStream interface
      var persistStream = (IPersistStream)filter;
      persistStream.Load(comStream);

      // Initialize iFilter
      FilterFlags filterFlags;
      result = filter.Init(
         FilterInit.IFILTER_INIT_INDEXING_ONLY, 0, IntPtr.Zero, out filterFlags);
      if (result != FilterReturnCodes.S_OK)
      {
         throw Marshal.GetExceptionForHR(unchecked((int)result))
            ?? new InvalidOperationException("IFilter.Init returned " + result);
      }

      return ExtractTextFromIFilter(filter);
   }
   finally
   {
      if (nativePtr != IntPtr.Zero)
      {
         Marshal.FreeHGlobal(nativePtr);
      }

      if (comStream != null)
      {
         Marshal.ReleaseComObject(comStream);
      }

      Marshal.ReleaseComObject(filter);
   }
}

L'estrazione del testo dal filtro, nel mio codice, si presenta così. Sul web ci sono molti esempi, e la si può implementare in molti modi a seconda delle necessità.

/// <summary>
/// Walks every chunk of an initialized filter and concatenates the text of the
/// chunks flagged as text.
/// </summary>
/// <param name="filter">Filter that has already been loaded and initialized.</param>
/// <returns>The accumulated text once the filter reports the end of chunks.</returns>
private string ExtractTextFromIFilter(IFilter filter)
{
   var sb = new StringBuilder();

   while (true)
   {
      StatChunk chunk;
      FilterReturnCodes result = filter.GetChunk(out chunk);

      if (result == FilterReturnCodes.S_OK)
      {
         // flags is a bit field, so test the text bit instead of comparing for equality.
         if ((chunk.flags & ChunkState.CHUNK_TEXT) != 0)
         {
            sb.Append(ExtractTextFromChunk(filter, chunk));
         }

         continue;
      }

      if (result == FilterReturnCodes.FILTER_E_END_OF_CHUNKS)
      {
         return sb.ToString();
      }

      // Any other code would otherwise spin forever in this loop.
      throw Marshal.GetExceptionForHR(unchecked((int)result))
         ?? new InvalidOperationException("IFilter.GetChunk returned " + result);
   }
}


/// <summary>
/// Reads all text of the current chunk by calling GetText until the filter
/// signals that no more text is available.
/// </summary>
/// <param name="filter">Filter positioned on the chunk to read.</param>
/// <param name="chunk">Descriptor of the current chunk (not inspected here).</param>
/// <returns>
/// The chunk text, or an empty string when the filter reports FILTER_E_NO_TEXT.
/// </returns>
// Note: "private virtual" is rejected by the compiler (CS0621), so the method is plain private.
private string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
   var sb = new StringBuilder();

   var result = FilterReturnCodes.S_OK;
   while (result == FilterReturnCodes.S_OK)
   {
      int sizeBuffer = 16384;
      var buffer = new StringBuilder(sizeBuffer);
      result = filter.GetText(ref sizeBuffer, buffer);

      if ((result == FilterReturnCodes.S_OK) || (result == FilterReturnCodes.FILTER_S_LAST_TEXT))
      {
         // Never ask the builder for more characters than it actually holds.
         int count = Math.Min(sizeBuffer, buffer.Length);
         if (count > 0)
         {
            sb.Append(buffer.ToString(0, count));
         }
      }

      if (result == FilterReturnCodes.FILTER_E_NO_TEXT)
      {
         return string.Empty;
      }

      if ((result == FilterReturnCodes.FILTER_S_LAST_TEXT) || (result == FilterReturnCodes.FILTER_E_NO_MORE_TEXT))
      {
         return sb.ToString();
      }
   }

   return sb.ToString();
}

Ed ecco le definizioni dei metodi nativi e delle strutture che essi utilizzano.

/// <summary>P/Invoke entry points used to obtain a filter and to wrap memory in a COM stream.</summary>
internal static class NativeMethods
{
    /// <summary>Loads the IFilter registered for the given path or file extension.</summary>
    [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
    public static extern FilterReturnCodes LoadIFilter(
        string pwcsPath,
        [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
        ref IFilter ppIUnk);

    /// <summary>Creates a COM stream over a global memory block; returns an HRESULT.</summary>
    // An extern method needs a DllImport to be callable; CreateStreamOnHGlobal is exported by ole32.dll.
    [DllImport("ole32.dll")]
    public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}

/// <summary>Managed declaration of the COM IFilter interface.</summary>
// InterfaceIsIUnknown: without it the runtime assumes a dual interface and the
// vtable slots would not line up. PreserveSig: the methods return their status
// code directly, which callers compare against FilterReturnCodes.
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>Initializes a filtering session.</summary>
    [PreserveSig]
    FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);

    /// <summary>Advances to the next chunk and returns its descriptor.</summary>
    [PreserveSig]
    FilterReturnCodes GetChunk(out StatChunk pStat);

    /// <summary>Retrieves text of the current chunk; pcwcBuffer carries the buffer size in and the character count out.</summary>
    [PreserveSig]
    FilterReturnCodes GetText(
        ref int pcwcBuffer,
        [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);

    /// <summary>Retrieves the value of the current chunk.</summary>
    [PreserveSig]
    FilterReturnCodes GetValue(ref IntPtr propVal);

    /// <summary>Retrieves an interface representing a region of text.</summary>
    [PreserveSig]
    FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}

/// <summary>Managed declaration of the COM IPersist interface.</summary>
[ComImport, InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
    /// <summary>Retrieves the class identifier of the object.</summary>
    void GetClassID(out Guid pClassID);
}

/// <summary>Managed declaration of the COM IPersistStream interface, used to load content into a filter from a stream.</summary>
[ComImport, InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
    // Re-declared so the IPersist slot is part of this interface's vtable layout.
    new void GetClassID(out Guid pClassID);

    // PreserveSig: the int is the HRESULT itself, not an [out, retval] parameter.
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the given stream.</summary>
    void Load([In] IStream pStm);

    /// <summary>Saves the object to the given stream.</summary>
    void Save(
        [In] IStream pStm,
        [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

    /// <summary>Retrieves the size needed to save the object.</summary>
    void GetSizeMax(out long pcbSize);
}

/// <summary>Chunk descriptor filled in by IFilter.GetChunk.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct StatChunk
{
    public int idChunk;
    public ChunkBreaktype breakType;
    public ChunkState flags;
    public int locale;
    public FullPropSpec attribute;
    public int idChunkSource;
    public int cwcStartSource;
    public int cwcLenSource;
}

/// <summary>Kind of break that separates a chunk from the previous one.</summary>
public enum ChunkBreaktype
{
    // Zero is a value a filter can report, so it gets a name.
    CHUNK_NO_BREAK = 0,
    CHUNK_EOW = 1,
    CHUNK_EOS = 2,
    CHUNK_EOP = 3,
    CHUNK_EOC = 4
}

/// <summary>Bit flags describing whether a chunk carries text, a value, or both.</summary>
[Flags]
public enum ChunkState
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>Flags returned by IFilter.Init through pdwFlags.</summary>
[Flags]
public enum FilterFlags
{
    IFILTER_FLAGS_OLE_PROPERTIES = 1
}

/// <summary>
/// Flags passed to IFilter.Init (native IFILTER_INIT). ParseIFilter relies on
/// IFILTER_INIT_INDEXING_ONLY being defined here.
/// </summary>
[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
}

/// <summary>Position of a text region inside a chunk, as passed to IFilter.BindRegion.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FilterRegion
{
    public int idChunk;
    public int cwcStart;
    public int cwcExtent;
}

/// <summary>HRESULT values returned by LoadIFilter and the IFilter methods.</summary>
public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Was 0x80000008; 0x80004005 is the E_FAIL value COM components actually return.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}

/// <summary>Property set GUID plus property specifier identifying a chunk's attribute.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FullPropSpec
{
    public Guid guidPropSet;
    public PropSpec psProperty;
}

/// <summary>Property specifier: a kind discriminator followed by an id and a string pointer.</summary>
// NOTE(review): the native PROPSPEC overlays propid and lpwstr in a union after
// ulKind. Declaring them as consecutive fields gives a different size on 32-bit
// processes and puts propid in padding on 64-bit ones - verify the layout before
// relying on propid or on the StatChunk fields that follow this structure.
[StructLayout(LayoutKind.Sequential)]
public struct PropSpec
{
    // Discriminator telling which of the two following members is meaningful.
    public int ulKind;

    public int propid;

    public IntPtr lpwstr;
}

In passato ho lavorato alla realizzazione di un IFilter pensato per dare a qualsiasi strumento di ricerca/indicizzazione accesso al contenuto testuale dei file DWG di AutoCAD. Potete leggere alcune delle mie avventure qui: http://blogs.msdn.com/b/ifilter/archive/2006/12/25/chronicles-of-an-ifilter-development-inception-to-deployment.aspx

Il codice a cui si fa riferimento è vecchio, ma ancora valido. Tuttavia, oggi ci sono altre interfacce in uso oltre a GetTextFromFile. Dovrete usare un lettore di stream: leggete, nel link citato sopra, la parte su IPersistStream. Se ho capito bene che cosa volete fare, dovete aprire il file come stream dal database e passare questo stream allo strumento di ricerca/indicizzazione o all'IFilter di vostra scelta.

In bocca al lupo, Marco

Speravo di fare la stessa cosa, ma ho finito per aggiungere alla tabella del database un'altra colonna, TextContent. Ho salvato il BinaryContent in un file temporaneo, ho usato la libreria DLL EPocalipse.IFilter di CodeProject per estrarre il testo e l'ho inserito nella colonna TextContent.

Basandomi sull'esempio di Marek, ecco la mia versione, che usa un'implementazione dell'interfaccia IStream invece di allocare memoria con Marshal.AllocHGlobal per creare uno stream COM.

Funziona con l'IFilter Adobe PDF a 64 bit 11.0.01 e con una tonnellata di formati come .doc, .docx, .html, .odt, .rtf; l'elenco potrebbe continuare.

Esempio completo:

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace TextExtraction
    class Program
        /// <summary>
        /// Demo entry point: opens a file, loads the matching filter from the
        /// stream, prints the extracted text and waits for a key press.
        /// </summary>
        static void Main(string[] args)
        {
            var file = new FileInfo(@"C:\Path\To\Some.doc");

            using (var stream = file.OpenRead())
            {
                var filter = Load(stream, file.Extension);
                if (filter != null)
                {
                    try
                    {
                        var text = GetText(filter);
                        Console.WriteLine(text);
                    }
                    finally
                    {
                        // The filter is a COM object; release it before the stream it reads from is closed.
                        Marshal.ReleaseComObject(filter);
                    }
                }
            }

            Console.WriteLine("Press your favorite key to exit");
            Console.ReadKey();
        }

        /// <summary>
        /// Loads the filter registered for <paramref name="extension"/>, feeds it the
        /// stream content through IPersistStream and initializes it.
        /// </summary>
        /// <param name="stream">Stream containing the document.</param>
        /// <param name="extension">File extension used to look up the filter.</param>
        /// <returns>
        /// The initialized filter, or null when no filter could be loaded, the filter
        /// does not support IPersistStream, or initialization did not return S_OK.
        /// </returns>
        private static IFilter Load(Stream stream, string extension)
        {
            IFilter filter = null;

            if (NativeMethods.LoadIFilter(extension, null, ref filter) != HRESULT.S_OK || filter == null)
            {
                return null;
            }

            if (filter is IPersistStream persistStream)
            {
                persistStream.Load(new ManagedStream(stream));

                if (filter.Init(IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out IFILTER_FLAGS filterFlags) == IFilterReturnCodes.S_OK)
                {
                    return filter;
                }
            }

            // The caller never sees this instance, so release it here.
            Marshal.ReleaseComObject(filter);
            return null;
        }

        /// <summary>Concatenates the text of every chunk the filter yields while GetChunk returns S_OK.</summary>
        /// <param name="filter">Initialized filter to read from.</param>
        /// <returns>The collected text; empty when the first GetChunk call does not return S_OK.</returns>
        private static string GetText(IFilter filter)
        {
            var text = new StringBuilder();

            while (filter.GetChunk(out var chunk) == IFilterReturnCodes.S_OK)
            {
                ReadChunk(filter, chunk, text);
            }

            return text.ToString();
        }

        /// <summary>Appends the text of the current chunk to <paramref name="text"/>.</summary>
        /// <param name="filter">Filter positioned on the chunk.</param>
        /// <param name="chunk">Descriptor of the chunk; its break type decides whether a line break is inserted.</param>
        /// <param name="text">Builder receiving the extracted characters.</param>
        private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
        {
            var textResult = IFilterReturnCodes.S_OK;
            while (textResult == IFilterReturnCodes.S_OK)
            {
                var bufferSize = 4096U;
                var buffer = new char[bufferSize];
                textResult = filter.GetText(ref bufferSize, buffer);

                if ((textResult == IFilterReturnCodes.S_OK || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT) && bufferSize > 0)
                {
                    // The statement guarded by this test was lost in the source;
                    // a paragraph break is rendered as a newline before the text.
                    if (chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP)
                    {
                        text.Append('\n');
                    }

                    text.Append(buffer, 0, (int) bufferSize);
                }
            }
        }

        /// <summary>Managed declaration of the COM IFilter interface.</summary>
        // Without the COM attributes a cast from the object returned by LoadIFilter
        // cannot be resolved; PreserveSig makes each method return its status code.
        [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IFilter
        {
            /// <summary>Initializes a filtering session.</summary>
            [PreserveSig]
            IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
                out IFILTER_FLAGS pdwFlags);

            /// <summary>Advances to the next chunk and returns its descriptor.</summary>
            [PreserveSig]
            IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

            /// <summary>Retrieves text of the current chunk; pcwcBuffer carries the buffer size in and the character count out.</summary>
            [PreserveSig]
            IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
                char[] awcBuffer);

            /// <summary>Retrieves the value of the current chunk.</summary>
            [PreserveSig]
            IFilterReturnCodes GetValue(ref IntPtr propVal);

            /// <summary>Retrieves an interface representing a region of text.</summary>
            [PreserveSig]
            IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
        }

        /// <summary>Managed declaration of the COM IPersist interface.</summary>
        [ComImport, Guid("0000010c-0000-0000-C000-000000000046"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersist
        {
            /// <summary>Retrieves the class identifier of the object.</summary>
            void GetClassID(out Guid pClassID);
        }

        /// <summary>Managed declaration of the COM IPersistStream interface, used to load a filter from a stream.</summary>
        [ComImport, Guid("00000109-0000-0000-C000-000000000046"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersistStream : IPersist
        {
            // Re-declared so the IPersist slot is part of this interface's vtable layout.
            new void GetClassID(out Guid pClassID);

            // PreserveSig: the int is the HRESULT itself, not an [out, retval] parameter.
            [PreserveSig]
            int IsDirty();

            /// <summary>Initializes the object from the given stream.</summary>
            void Load([In] IStream pStm);

            /// <summary>Saves the object to the given stream.</summary>
            void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

            /// <summary>Retrieves the size needed to save the object.</summary>
            void GetSizeMax(out long pcbSize);
        }

        /// <summary>
        /// Managed declaration of the COM IStream interface with HRESULT-returning
        /// methods, so that a managed class can implement it and be handed to a filter.
        /// </summary>
        [ComImport, Guid("0000000c-0000-0000-C000-000000000046"), InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IStream
        {
            [PreserveSig]
            HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
                byte[] pv, int cb, IntPtr pcbRead);

            [PreserveSig]
            HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
                byte[] pv, int cb, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

            [PreserveSig]
            HRESULT SetSize(long libNewSize);

            [PreserveSig]
            HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Commit(int grfCommitFlags);

            [PreserveSig]
            HRESULT Revert();

            [PreserveSig]
            HRESULT LockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

            [PreserveSig]
            HRESULT Clone(out IStream ppstm);
        }

        /// <summary>
        /// Read-only adapter exposing a managed <see cref="Stream"/> through the
        /// IStream interface. Only Read, Seek and Stat are implemented; every other
        /// member returns E_NOTIMPL.
        /// </summary>
        public class ManagedStream : IStream
        {
            private readonly Stream _stream;

            /// <summary>Wraps the given stream.</summary>
            /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
            public ManagedStream(Stream stream)
            {
                _stream = stream ?? throw new ArgumentNullException(nameof(stream));
            }

            public HRESULT Clone(out IStream ppstm)
            {
                ppstm = null;
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Commit(int grfCommitFlags)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT LockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            /// <summary>Reads up to <paramref name="cb"/> bytes into <paramref name="pv"/> and reports the count through <paramref name="pcbRead"/> when it is not null.</summary>
            public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
            {
                // Stream.Read may return fewer bytes than requested before the end of
                // the data, so keep reading until the request is satisfied or the
                // stream is exhausted.
                var bytesRead = 0;
                while (bytesRead < cb)
                {
                    var read = _stream.Read(pv, bytesRead, cb - bytesRead);
                    if (read == 0)
                    {
                        break;
                    }

                    bytesRead += read;
                }

                if (pcbRead != IntPtr.Zero)
                {
                    Marshal.WriteInt32(pcbRead, bytesRead);
                }

                return HRESULT.S_OK;
            }

            public HRESULT Revert()
            {
                return HRESULT.E_NOTIMPL;
            }

            /// <summary>Moves the stream position; returns E_FAIL for an unknown origin.</summary>
            public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
            {
                SeekOrigin seekOrigin;

                switch (dwOrigin)
                {
                    case (int) STREAM_SEEK.STREAM_SEEK_SET:
                        seekOrigin = SeekOrigin.Begin;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                        seekOrigin = SeekOrigin.Current;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_END:
                        seekOrigin = SeekOrigin.End;
                        break;
                    default:
                        return HRESULT.E_FAIL;
                }

                var position = _stream.Seek(dlibMove, seekOrigin);

                if (plibNewPosition != IntPtr.Zero)
                {
                    Marshal.WriteInt64(plibNewPosition, position);
                }

                return HRESULT.S_OK;
            }

            public HRESULT SetSize(long libNewSize)
            {
                return HRESULT.E_NOTIMPL;
            }

            /// <summary>Reports type, length and access mode of the wrapped stream; E_ACCESSDENIED when it is neither readable nor writable.</summary>
            public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
            {
                pstatstg = new STATSTG
                {
                    type = (int) STGTY.STGTY_STREAM,
                    cbSize = _stream.Length,
                    grfMode = (int) STGM.STGM_READ
                };

                if (_stream.CanRead && _stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
                }
                else if (_stream.CanRead)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READ;
                }
                else if (_stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_WRITE;
                }
                else
                {
                    return HRESULT.E_ACCESSDENIED;
                }

                return HRESULT.S_OK;
            }

            public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType)
            {
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten)
            {
                return HRESULT.E_NOTIMPL;
            }
        }

        /// <summary>P/Invoke entry point used to obtain a filter for a path or file extension.</summary>
        public class NativeMethods
        {
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
        }

        /// <summary>Timestamp split into low and high 32-bit halves, as embedded in STATSTG.</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct FILETIME
        {
            public uint DateTimeLow;
            public uint DateTimeHigh;
        }

        /// <summary>Position of a text region inside a chunk, as passed to IFilter.BindRegion.</summary>
        // The native members are 32-bit ULONGs; 64-bit ulong fields would make the
        // structure twice as large as the one the filter reads.
        [StructLayout(LayoutKind.Sequential)]
        public struct FILTERREGION
        {
            public uint idChunk;
            public uint cwcStart;
            public uint cwcExtent;
        }

        /// <summary>Property set GUID plus property specifier identifying a chunk's attribute.</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct FULLPROPSPEC
        {
            public Guid guidPropSet;
            public PROPSPEC psProperty;
        }

        /// <summary>Property specifier: a kind discriminator followed by an id and a string pointer.</summary>
        // NOTE(review): the native PROPSPEC overlays propid and lpwstr in a union
        // after ulKind. Declaring them as consecutive fields gives a different size
        // on 32-bit processes and puts propid in padding on 64-bit ones - verify
        // before relying on propid or on the STAT_CHUNK fields after this structure.
        [StructLayout(LayoutKind.Sequential)]
        public struct PROPSPEC
        {
            // Discriminator telling which of the two following members is meaningful.
            public PROPSPECKIND ulKind;

            public uint propid;

            public IntPtr lpwstr;
        }

        /// <summary>Chunk descriptor filled in by IFilter.GetChunk.</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct STAT_CHUNK
        {
            public int idChunk;

            // Read by ReadChunk to decide whether to emit a paragraph break.
            public CHUNK_BREAKTYPE breakType;

            public CHUNKSTATE flags;

            public int locale;

            public FULLPROPSPEC attribute;

            public int idChunkSource;

            public int cwcStartSource;

            public int cwcLenSource;
        }

        /// <summary>Stream statistics returned by IStream.Stat.</summary>
        // CharSet.Unicode: pwcsName is a wide-character string on the native side;
        // the default struct charset would marshal it as ANSI.
        [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
        public struct STATSTG
        {
            public string pwcsName;
            public int type;
            public long cbSize;
            public FILETIME mtime;
            public FILETIME ctime;
            public FILETIME atime;
            public int grfMode;
            public int grfLocksSupported;
            public Guid clsid;
            public int grfStateBits;
            public int reserved;
        }

        /// <summary>HRESULT values returned by the IFilter methods.</summary>
        public enum IFilterReturnCodes : uint
        {
            S_OK = 0,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_INVALIDARG = 0x80070057,
            E_OUTOFMEMORY = 0x8007000E,
            E_NOTIMPL = 0x80004001,
            // Was 0x80000008; aligned with HRESULT.E_FAIL (0x80004005) declared in this file.
            E_FAIL = 0x80004005,

            FILTER_E_PASSWORD = 0x8004170B,
            FILTER_E_UNKNOWNFORMAT = 0x8004170C,
            FILTER_E_NO_TEXT = 0x80041705,
            FILTER_E_NO_VALUES = 0x80041706,
            FILTER_E_END_OF_CHUNKS = 0x80041700,
            FILTER_E_NO_MORE_TEXT = 0x80041701,
            FILTER_E_NO_MORE_VALUES = 0x80041702,
            FILTER_E_ACCESS = 0x80041703,
            FILTER_W_MONIKER_CLIPPED = 0x00041704,
            FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
            FILTER_E_LINK_UNAVAILABLE = 0x80041708,
            FILTER_S_LAST_TEXT = 0x00041709,
            FILTER_S_LAST_VALUES = 0x0004170A
        }

        /// <summary>Kind of break that separates a chunk from the previous one.</summary>
        public enum CHUNK_BREAKTYPE : uint
        {
            CHUNK_NO_BREAK = 0,
            CHUNK_EOW = 1,
            CHUNK_EOS = 2,
            CHUNK_EOP = 3,
            CHUNK_EOC = 4
        }

        /// <summary>Bit flags describing whether a chunk carries text, a value, or a filter-owned value.</summary>
        [Flags]
        public enum CHUNKSTATE : uint
        {
            CHUNK_TEXT = 0x1,
            CHUNK_VALUE = 0x2,
            CHUNK_FILTER_OWNED_VALUE = 0x4
        }

        /// <summary>General COM result codes used by LoadIFilter and the IStream implementation.</summary>
        public enum HRESULT : uint
        {
            S_OK = 0x00000000,
            E_NOTIMPL = 0x80004001,
            E_NOINTERFACE = 0x80004002,
            E_POINTER = 0x80004003,
            E_ABORT = 0x80004004,
            E_FAIL = 0x80004005,
            E_UNEXPECTED = 0x8000FFFF,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_OUTOFMEMORY = 0x8007000E,
            E_INVALIDARG = 0x80070057
        }

        /// <summary>Flags returned by IFilter.Init through pdwFlags.</summary>
        [Flags]
        public enum IFILTER_FLAGS
        {
            IFILTER_FLAGS_OLE_PROPERTIES = 1
        }

        /// <summary>
        /// Flags passed to IFilter.Init. Load relies on
        /// IFILTER_INIT_APPLY_INDEX_ATTRIBUTES being defined here.
        /// </summary>
        [Flags]
        public enum IFILTER_INIT
        {
            IFILTER_INIT_CANON_PARAGRAPHS = 1,
            IFILTER_INIT_HARD_LINE_BREAKS = 2,
            IFILTER_INIT_CANON_HYPHENS = 4,
            IFILTER_INIT_CANON_SPACES = 8,
            IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
            IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
            IFILTER_INIT_INDEXING_ONLY = 64,
            IFILTER_INIT_SEARCH_LINKS = 128,
        }

        /// <summary>Discriminator stored in PROPSPEC.ulKind.</summary>
        // The native ulKind is a 32-bit ULONG; a 64-bit underlying type enlarges
        // PROPSPEC and shifts every STAT_CHUNK field that follows it.
        public enum PROPSPECKIND : uint
        {
            PRSPEC_LPWSTR = 0,
            PRSPEC_PROPID = 1
        }

        /// <summary>Storage access-mode flags; values are cast to int when stored in STATSTG.grfMode.</summary>
        // The flags travel in a 32-bit field, so the underlying type is uint rather than ulong.
        [Flags]
        public enum STGM : uint
        {
            STGM_READ = 0x00000000,
            STGM_WRITE = 0x00000001,
            STGM_READWRITE = 0x00000002,
            STGM_SHARE_DENY_NONE = 0x00000040,
            STGM_SHARE_DENY_READ = 0x00000030,
            STGM_SHARE_DENY_WRITE = 0x00000020,
            STGM_SHARE_EXCLUSIVE = 0x00000010,
            STGM_PRIORITY = 0x00040000,
            STGM_CREATE = 0x00001000,
            STGM_CONVERT = 0x00020000,
            STGM_FAILIFTHERE = 0x00000000,
            STGM_DIRECT = 0x00000000,
            STGM_TRANSACTED = 0x00010000,
            STGM_NOSCRATCH = 0x00100000,
            STGM_NOSNAPSHOT = 0x00200000,
            STGM_SIMPLE = 0x08000000,
            STGM_DIRECT_SWMR = 0x00400000,
            STGM_DELETEONRELEASE = 0x04000000
        }

        /// <summary>Storage element type reported in STATSTG.type.</summary>
        public enum STGTY : int
        {
            STGTY_STORAGE = 1,
            STGTY_STREAM = 2,
            STGTY_LOCKBYTES = 3,
            STGTY_PROPERTY = 4
        }

        /// <summary>Origin values accepted by IStream.Seek.</summary>
        public enum STREAM_SEEK : int
        {
            STREAM_SEEK_SET = 0,
            STREAM_SEEK_CUR = 1,
            STREAM_SEEK_END = 2
        }
