
For a C# web application, I want to index text from PDF, DOC, and similar files stored in a database.

I have been experimenting with an IFilter example on Code Project which works great for files from the file system, but my files are stored in a MS-SQL database.

Can anyone help me locate a sample to extract text from files stored in a database or have an idea on how to modify the Code Project code to work with a database instead of the file system?

Finally after many hours I figured out how to make this work! I needed to run IFilter on PDF content stored in a database and I wanted to avoid saving the data to temporary files.

First I tried to use the BindIFilterFromStream API to create an IFilter for content stored in a Stream, but it seems that it doesn't work properly (at least not for this scenario). So don't go that way.

Instead you need to create an IFilter for a file extension (or access it some other way). Then you can access the IPersistStream COM interface and use it to load the PDF content into the IFilter. The rest works the same as for files. However, note that the IPersistStream API may not be implemented by every IFilter. It works for the Adobe PDF IFilter though.

The code should look like this (I removed some return code checking to make the code more readable, however, you should check all possible return codes).

/// <summary>
/// Extracts the text of PDF content held in a stream by handing the bytes to the
/// registered PDF IFilter through IPersistStream, without writing a temporary file.
/// </summary>
/// <param name="s">Readable stream positioned at the start of the PDF content.</param>
/// <returns>The text returned by <c>ExtractTextFromIFilter</c>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="s"/> is null.</exception>
/// <exception cref="InvalidOperationException">No filter could be loaded, or Init failed.</exception>
/// <exception cref="NotSupportedException">The filter does not implement IPersistStream.</exception>
private string ParseIFilter(Stream s)
{
   if (s == null)
   {
      throw new ArgumentNullException("s");
   }

   // Get an IFilter for a file or file extension
   IFilter filter = null;
   FilterReturnCodes result = NativeMethods.LoadIFilter(".pdf", null, ref filter);
   if (result != FilterReturnCodes.S_OK || filter == null)
   {
      throw new InvalidOperationException("Could not load an IFilter for .pdf: " + result);
   }

   try
   {
      // Not every IFilter implements IPersistStream, so test instead of casting blindly.
      var persistStream = filter as IPersistStream;
      if (persistStream == null)
      {
         throw new NotSupportedException("The .pdf IFilter does not implement IPersistStream.");
      }

      // Stream.Read may return fewer bytes than requested; CopyTo drains the stream fully.
      byte[] buffer;
      using (var copy = new MemoryStream())
      {
         s.CopyTo(copy);
         buffer = copy.ToArray();
      }

      // Create a COM stream. Passing IntPtr.Zero lets COM allocate (and, with
      // fDeleteOnRelease = true, free) the backing global memory itself.
      System.Runtime.InteropServices.ComTypes.IStream comStream;
      int hr = NativeMethods.CreateStreamOnHGlobal(IntPtr.Zero, true, out comStream);
      if (hr != 0)
      {
         Marshal.ThrowExceptionForHR(hr);
      }

      try
      {
         comStream.Write(buffer, buffer.Length, IntPtr.Zero);
         comStream.Seek(0, 0 /* STREAM_SEEK_SET */, IntPtr.Zero);

         // Load the contents to the iFilter using IPersistStream interface
         persistStream.Load(comStream);

         // Initialize iFilter
         FilterFlags filterFlags;
         result = filter.Init(
            FilterInit.IFILTER_INIT_INDEXING_ONLY, 0, IntPtr.Zero, out filterFlags);
         if (result != FilterReturnCodes.S_OK)
         {
            throw new InvalidOperationException("IFilter.Init failed: " + result);
         }

         return ExtractTextFromIFilter(filter);
      }
      finally
      {
         Marshal.ReleaseComObject(comStream);
      }
   }
   finally
   {
      Marshal.ReleaseComObject(filter);
   }
}

Text extraction from the filter looks like this in my code. There are many examples of this on the web and it can be implemented in many ways depending on what you need.

/// <summary>
/// Walks every chunk of an initialized filter and concatenates the text of the
/// text-type chunks.
/// </summary>
/// <param name="filter">A filter that has been loaded and initialized.</param>
/// <returns>
/// The text collected until the filter reports the end of its chunks or an
/// unrecoverable error; text gathered before such an error is still returned.
/// </returns>
private string ExtractTextFromIFilter(IFilter filter)
{
   // Numeric codes are used so this method does not depend on these two members
   // being present in FilterReturnCodes.
   const uint EmbeddingUnavailable = 0x80041707;
   const uint LinkUnavailable = 0x80041708;

   var sb = new StringBuilder();

   while (true)
   {
      StatChunk chunk;
      FilterReturnCodes result = filter.GetChunk(out chunk);

      if (result == FilterReturnCodes.S_OK)
      {
         // flags is a bit field, so test the bit rather than comparing for equality.
         if ((chunk.flags & ChunkState.CHUNK_TEXT) != 0)
         {
            sb.Append(ExtractTextFromChunk(filter, chunk));
         }

         continue;
      }

      // These two codes only mean "this chunk cannot be read"; later chunks may follow.
      if ((uint)result == EmbeddingUnavailable || (uint)result == LinkUnavailable)
      {
         continue;
      }

      // FILTER_E_END_OF_CHUNKS, or any other failure. Returning here also prevents
      // spinning forever on a persistent error code.
      return sb.ToString();
   }
}


/// <summary>
/// Reads all text of the current chunk by calling GetText until the filter
/// signals that no more text is available.
/// </summary>
/// <param name="filter">The filter positioned on the chunk to read.</param>
/// <param name="chunk">Descriptor of the current chunk (not inspected here).</param>
/// <returns>
/// The chunk's text, or an empty string when the filter reports FILTER_E_NO_TEXT.
/// </returns>
/// <remarks>
/// The original declaration was <c>private virtual</c>, which C# rejects;
/// <c>virtual</c> has been dropped because a private member cannot be overridden.
/// </remarks>
private string ExtractTextFromChunk(IFilter filter, StatChunk chunk)
{
   var sb = new StringBuilder();

   var result = FilterReturnCodes.S_OK;
   while (result == FilterReturnCodes.S_OK)
   {
      int sizeBuffer = 16384;
      var buffer = new StringBuilder(sizeBuffer);
      result = filter.GetText(ref sizeBuffer, buffer);

      if ((result == FilterReturnCodes.S_OK) || (result == FilterReturnCodes.FILTER_S_LAST_TEXT))
      {
         // Clamp to the marshalled length so ToString cannot throw when the
         // reported count exceeds what the StringBuilder actually holds.
         int length = Math.Min(sizeBuffer, buffer.Length);
         if (length > 0)
         {
            sb.Append(buffer.ToString(0, length));
         }
      }

      if (result == FilterReturnCodes.FILTER_E_NO_TEXT)
      {
         return string.Empty;
      }

      // FILTER_S_LAST_TEXT, FILTER_E_NO_MORE_TEXT and every other code end the
      // loop through its condition.
   }

   return sb.ToString();
}

And here are the definitions of native methods and the structures used by them.

/// <summary>P/Invoke entry points used to obtain a filter and a COM stream.</summary>
internal static class NativeMethods
{
    /// <summary>Loads the IFilter registered for the given path or file extension.</summary>
    [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
    public static extern FilterReturnCodes LoadIFilter(
        string pwcsPath,
        [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter,
        ref IFilter ppIUnk);

    /// <summary>
    /// Creates a COM stream backed by global memory. Returns an HRESULT (0 on success).
    /// An extern method must carry a DllImport attribute; this function is exported
    /// by ole32.dll.
    /// </summary>
    [DllImport("ole32.dll")]
    public static extern int CreateStreamOnHGlobal(IntPtr hGlobal, bool fDeleteOnRelease, out IStream ppstm);
}

/// <summary>
/// Managed declaration of the COM IFilter interface.
/// </summary>
/// <remarks>
/// IFilter derives directly from IUnknown, so the interface type must be stated
/// (the default is a dual interface). Each method is marked PreserveSig so that the
/// declared return value is the HRESULT itself; the callers compare it against
/// FilterReturnCodes, including non-failure codes such as FILTER_S_LAST_TEXT.
/// </remarks>
[ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
public interface IFilter
{
    /// <summary>Initializes a filtering session.</summary>
    [PreserveSig]
    FilterReturnCodes Init(FilterInit grfFlags, int cAttributes, IntPtr aAttributes, out FilterFlags pdwFlags);

    /// <summary>Positions the filter on the next chunk and returns its descriptor.</summary>
    [PreserveSig]
    FilterReturnCodes GetChunk(out StatChunk pStat);

    /// <summary>Retrieves text from the current chunk; pcwcBuffer is the capacity in, count out.</summary>
    [PreserveSig]
    FilterReturnCodes GetText(
        ref int pcwcBuffer,
        [Out, MarshalAs(UnmanagedType.LPWStr)] StringBuilder awcBuffer);

    /// <summary>Retrieves a value from the current chunk.</summary>
    [PreserveSig]
    FilterReturnCodes GetValue(ref IntPtr propVal);

    /// <summary>Retrieves an interface representing the given portion of the object.</summary>
    [PreserveSig]
    FilterReturnCodes BindRegion(ref FilterRegion origPos, ref Guid riid, ref object ppunk);
}

/// <summary>Managed declaration of the COM IPersist interface.</summary>
[ComImport]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("0000010c-0000-0000-C000-000000000046")]
public interface IPersist
{
    /// <summary>Retrieves the class identifier of the object.</summary>
    void GetClassID(out Guid pClassID);
}

/// <summary>
/// Managed declaration of the COM IPersistStream interface, used to feed stream
/// content to a filter.
/// </summary>
[ComImport]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown), Guid("00000109-0000-0000-C000-000000000046")]
public interface IPersistStream : IPersist
{
    // Redeclared so that the slot inherited from IPersist keeps its position in this interface.
    new void GetClassID(out Guid pClassID);

    /// <summary>
    /// Returns the raw HRESULT. PreserveSig is required: without it the int would be
    /// marshalled as an extra out parameter that the native method does not have.
    /// </summary>
    [PreserveSig]
    int IsDirty();

    /// <summary>Initializes the object from the given stream.</summary>
    void Load([In] IStream pStm);

    /// <summary>Saves the object to the given stream.</summary>
    void Save(
        [In] IStream pStm,
        [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

    /// <summary>Retrieves the size of the stream needed to save the object.</summary>
    void GetSizeMax(out long pcbSize);
}

/// <summary>Chunk descriptor filled in by IFilter.GetChunk (native STAT_CHUNK).</summary>
[StructLayout(LayoutKind.Sequential)]
public struct StatChunk
{
    public int idChunk;              // chunk identifier
    public ChunkBreaktype breakType; // break that precedes this chunk
    public ChunkState flags;         // text-type and/or value-type
    public int locale;               // locale identifier of the chunk
    public FullPropSpec attribute;   // property this chunk belongs to
    public int idChunkSource;        // chunk this one was derived from
    public int cwcStartSource;       // start offset within the source chunk
    public int cwcLenSource;         // length within the source chunk
}

/// <summary>Kind of break that separates a chunk from the previous one.</summary>
public enum ChunkBreaktype
{
    CHUNK_NO_BREAK = 0, // no break; was missing although 0 is a value filters return
    CHUNK_EOW = 1,      // end of word
    CHUNK_EOS = 2,      // end of sentence
    CHUNK_EOP = 3,      // end of paragraph
    CHUNK_EOC = 4       // end of chapter
}

/// <summary>Bit flags telling whether a chunk carries text, a value, or both.</summary>
[Flags]
public enum ChunkState
{
    CHUNK_TEXT = 0x1,
    CHUNK_VALUE = 0x2,
    CHUNK_FILTER_OWNED_VALUE = 0x4
}

/// <summary>Flags returned by IFilter.Init (native IFILTER_FLAGS).</summary>
public enum FilterFlags
{
    // The caller should also use the object's OLE property set interfaces to find
    // further properties.
    IFILTER_FLAGS_OLE_PROPERTIES = 1
}

/// <summary>
/// Flags passed to IFilter.Init (native IFILTER_INIT). The members were missing from
/// the listing although IFILTER_INIT_INDEXING_ONLY is used by the calling code.
/// </summary>
[Flags]
public enum FilterInit
{
    IFILTER_INIT_CANON_PARAGRAPHS = 1,
    IFILTER_INIT_HARD_LINE_BREAKS = 2,
    IFILTER_INIT_CANON_HYPHENS = 4,
    IFILTER_INIT_CANON_SPACES = 8,
    IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
    IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
    IFILTER_INIT_INDEXING_ONLY = 64,
    IFILTER_INIT_SEARCH_LINKS = 128,
    IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
    IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
}

/// <summary>Position and extent of text within a chunk (native FILTERREGION).</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FilterRegion
{
    public int idChunk;   // chunk identifier
    public int cwcStart;  // start of the region within the chunk
    public int cwcExtent; // extent of the region
}

/// <summary>HRESULT values returned by LoadIFilter and the IFilter methods.</summary>
public enum FilterReturnCodes : uint
{
    S_OK = 0,
    E_ACCESSDENIED = 0x80070005,
    E_HANDLE = 0x80070006,
    E_INVALIDARG = 0x80070057,
    E_OUTOFMEMORY = 0x8007000E,
    E_NOTIMPL = 0x80004001,
    // Was 0x80000008, the 16-bit Windows value; every other code here uses the
    // Win32 value, for which E_FAIL is 0x80004005.
    E_FAIL = 0x80004005,
    FILTER_E_PASSWORD = 0x8004170B,
    FILTER_E_UNKNOWNFORMAT = 0x8004170C,
    FILTER_E_NO_TEXT = 0x80041705,
    FILTER_E_NO_VALUES = 0x80041706,
    FILTER_E_END_OF_CHUNKS = 0x80041700,
    FILTER_E_NO_MORE_TEXT = 0x80041701,
    FILTER_E_NO_MORE_VALUES = 0x80041702,
    FILTER_E_ACCESS = 0x80041703,
    FILTER_W_MONIKER_CLIPPED = 0x00041704,
    FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
    FILTER_E_LINK_UNAVAILABLE = 0x80041708,
    FILTER_S_LAST_TEXT = 0x00041709,
    FILTER_S_LAST_VALUES = 0x0004170A
}

/// <summary>Identifies a property by property set GUID plus property specifier.</summary>
[StructLayout(LayoutKind.Sequential)]
public struct FullPropSpec
{
    public Guid guidPropSet;    // property set the property belongs to
    public PropSpec psProperty; // property within that set
}

/// <summary>
/// Property specifier (native PROPSPEC): a kind followed by a union of a numeric
/// property id and a pointer to a property name.
/// </summary>
/// <remarks>
/// The native union occupies one pointer-sized, pointer-aligned slot. Declaring
/// propid and lpwstr as two sequential fields gives the wrong offsets (and the wrong
/// size on 32-bit), which also shifts the StatChunk fields that follow it. The union
/// is therefore stored once, as lpwstr, and propid is exposed as a view over its low
/// 32 bits.
/// </remarks>
[StructLayout(LayoutKind.Sequential)]
public struct PropSpec
{
    public int ulKind;     // 0 - lpwstr is used; 1 - propid is used

    public IntPtr lpwstr;  // storage for the union

    public int propid
    {
        get { return unchecked((int)lpwstr.ToInt64()); }
        set { lpwstr = new IntPtr(value); }
    }
}


I have worked in the past on providing an iFilter intended to provide any search/indexing tool access to text contents inside an AutoCad dwg file. You can read some of my adventure here:

The code you are referring to is old, but still valid. However, there are now more interfaces in use besides GetTextFromFile. You will need to use the stream reader; read up on IPersistStream in the link I mentioned above. If I understand what you want to do, you'll need to open the file as a stream from the database and present this stream to the search/indexer or the iFilter of your choice.

Good luck, Marco

I was hoping to do the same thing, but I ended up adding another column to the database table for the TextContent. I saved the BinaryContent to a temporary file, used the CodeProject library EPocalipse.IFilter DLL to extract the text, and stored that in the TextContent column.

Building on Marek's example, here's my take, which uses an implementation of the IStream interface instead of allocating memory through Marshal.AllocHGlobal to create a COM stream.

It works with the Adobe PDF iFilter 64 11.0.01 and a ton of formats such as .doc, .docx, .html, .odt, .rtf, the list goes on.

Complete example:

using System;
using System.IO;
using System.Runtime.InteropServices;
using System.Text;

namespace TextExtraction
    class Program
        /// <summary>
        /// Demo entry point: opens a sample document, extracts its text through the
        /// matching IFilter and prints it.
        /// </summary>
        static void Main(string[] args)
        {
            var file = new FileInfo(@"C:\Path\To\Some.doc");

            using (var stream = file.OpenRead())
            {
                var filter = Load(stream, file.Extension);
                if (filter != null)
                {
                    try
                    {
                        // The extracted text was previously computed and then discarded.
                        var text = GetText(filter);
                        Console.WriteLine(text);
                    }
                    finally
                    {
                        // Release the COM filter while the stream it reads is still open.
                        Marshal.ReleaseComObject(filter);
                    }
                }
            }

            Console.WriteLine("Press your favorite key to exit");
            Console.ReadKey();
        }

        /// <summary>
        /// Loads the IFilter registered for <paramref name="extension"/>, feeds it the
        /// stream through IPersistStream and initializes it.
        /// </summary>
        /// <param name="stream">Content to filter; it must stay open while the filter is used.</param>
        /// <param name="extension">File extension used to look up the filter, e.g. ".doc".</param>
        /// <returns>
        /// The initialized filter, or null when no filter is found, the filter does not
        /// implement IPersistStream, or Init fails.
        /// </returns>
        private static IFilter Load(Stream stream, string extension)
        {
            IFilter filter = null;

            if (NativeMethods.LoadIFilter(extension, null, ref filter) != HRESULT.S_OK || filter == null)
            {
                return null;
            }

            var ready = false;
            try
            {
                if (filter is IPersistStream persistStream)
                {
                    persistStream.Load(new ManagedStream(stream));

                    ready = filter.Init(IFILTER_INIT.IFILTER_INIT_APPLY_INDEX_ATTRIBUTES, 0, IntPtr.Zero, out IFILTER_FLAGS filterFlags) == IFilterReturnCodes.S_OK;
                }

                return ready ? filter : null;
            }
            finally
            {
                // Covers the null-returning paths and an exception from Load alike:
                // a filter that is not handed to the caller must not be leaked.
                if (!ready)
                {
                    Marshal.ReleaseComObject(filter);
                }
            }
        }

        /// <summary>Concatenates the text of every text chunk the filter produces.</summary>
        /// <param name="filter">An initialized filter.</param>
        /// <returns>All text read before the filter reported the end of its chunks or an error.</returns>
        private static string GetText(IFilter filter)
        {
            var text = new StringBuilder();

            while (true)
            {
                var chunkResult = filter.GetChunk(out var chunk);

                if (chunkResult == IFilterReturnCodes.S_OK)
                {
                    if ((chunk.flags & CHUNKSTATE.CHUNK_TEXT) != 0)
                    {
                        ReadChunk(filter, chunk, text);
                    }

                    continue;
                }

                // An unreadable embedding or link concerns that one chunk only; stopping
                // here would drop the rest of the document.
                if (chunkResult == IFilterReturnCodes.FILTER_E_EMBEDDING_UNAVAILABLE ||
                    chunkResult == IFilterReturnCodes.FILTER_E_LINK_UNAVAILABLE)
                {
                    continue;
                }

                break;
            }

            return text.ToString();
        }

        /// <summary>Appends the text of the current chunk to <paramref name="text"/>.</summary>
        /// <param name="filter">The filter positioned on the chunk.</param>
        /// <param name="chunk">Descriptor of the chunk; its break type decides whether a line break is emitted first.</param>
        /// <param name="text">Accumulator receiving the chunk's text.</param>
        private static void ReadChunk(IFilter filter, STAT_CHUNK chunk, StringBuilder text)
        {
            // NOTE(review): the statement guarded by the CHUNK_EOP test is missing from
            // the listing, which left the test gating the append itself. A line break
            // before the paragraph is the assumed intent - confirm. It is written once
            // per chunk, not once per GetText call, so long chunks are not split.
            var paragraphBreakPending = chunk.breakType == CHUNK_BREAKTYPE.CHUNK_EOP;

            var textResult = IFilterReturnCodes.S_OK;
            while (textResult == IFilterReturnCodes.S_OK)
            {
                var bufferSize = 4096U;
                var buffer = new char[bufferSize];
                textResult = filter.GetText(ref bufferSize, buffer);

                var gotText = (textResult == IFilterReturnCodes.S_OK || textResult == IFilterReturnCodes.FILTER_S_LAST_TEXT) && bufferSize > 0;
                if (!gotText)
                {
                    continue;
                }

                if (paragraphBreakPending)
                {
                    text.Append('\n');
                    paragraphBreakPending = false;
                }

                // Never trust the reported count beyond the buffer that was supplied.
                text.Append(buffer, 0, (int) Math.Min(bufferSize, (uint) buffer.Length));
            }
        }

        /// <summary>Managed declaration of the COM IFilter interface.</summary>
        /// <remarks>
        /// The GUID and IUnknown interface type are required for the runtime to query
        /// the native object for this interface. PreserveSig makes the declared return
        /// value the HRESULT itself, which the callers compare against IFilterReturnCodes.
        /// </remarks>
        [ComImport, Guid("89BCB740-6119-101A-BCB7-00DD010655AF")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IFilter
        {
            /// <summary>Initializes a filtering session.</summary>
            [PreserveSig]
            IFilterReturnCodes Init(IFILTER_INIT grfFlags, int cAttributes, IntPtr aAttributes,
                out IFILTER_FLAGS pdwFlags);

            /// <summary>Positions the filter on the next chunk and returns its descriptor.</summary>
            [PreserveSig]
            IFilterReturnCodes GetChunk(out STAT_CHUNK pStat);

            /// <summary>Retrieves text from the current chunk; pcwcBuffer is the capacity in, count out.</summary>
            [PreserveSig]
            IFilterReturnCodes GetText(ref uint pcwcBuffer, [Out, MarshalAs(UnmanagedType.LPArray)]
                char[] awcBuffer);

            /// <summary>Retrieves a value from the current chunk.</summary>
            [PreserveSig]
            IFilterReturnCodes GetValue(ref IntPtr propVal);

            /// <summary>Retrieves an interface representing the given portion of the object.</summary>
            [PreserveSig]
            IFilterReturnCodes BindRegion(ref FILTERREGION origPos, ref Guid riid, ref object ppunk);
        }

        /// <summary>Managed declaration of the COM IPersist interface.</summary>
        [ComImport, Guid("0000010c-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersist
        {
            /// <summary>Retrieves the class identifier of the object.</summary>
            void GetClassID(out Guid pClassID);
        }

        /// <summary>
        /// Managed declaration of the COM IPersistStream interface, used to hand stream
        /// content to a filter.
        /// </summary>
        [ComImport, Guid("00000109-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IPersistStream : IPersist
        {
            // Redeclared so that the slot inherited from IPersist keeps its position in this interface.
            new void GetClassID(out Guid pClassID);

            /// <summary>
            /// Returns the raw HRESULT. PreserveSig is required: without it the int would
            /// be marshalled as an extra out parameter that the native method does not have.
            /// </summary>
            [PreserveSig]
            int IsDirty();

            /// <summary>Initializes the object from the given stream.</summary>
            void Load([In] IStream pStm);

            /// <summary>Saves the object to the given stream.</summary>
            void Save([In] IStream pStm, [In, MarshalAs(UnmanagedType.Bool)] bool fClearDirty);

            /// <summary>Retrieves the size of the stream needed to save the object.</summary>
            void GetSizeMax(out long pcbSize);
        }

        /// <summary>
        /// Managed declaration of the COM IStream interface, with the two
        /// ISequentialStream methods (Read, Write) first.
        /// </summary>
        /// <remarks>
        /// Native callers locate this interface by its IID, so the GUID is mandatory.
        /// Every method is PreserveSig so the HRESULT it returns reaches the caller as-is.
        /// </remarks>
        [ComImport, Guid("0000000c-0000-0000-C000-000000000046")]
        [InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
        public interface IStream
        {
            /// <summary>Reads up to cb bytes into pv; pcbRead (optional) receives the count.</summary>
            [PreserveSig]
            HRESULT Read([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)] [Out]
                byte[] pv, int cb, IntPtr pcbRead);

            /// <summary>Writes cb bytes from pv; pcbWritten (optional) receives the count.</summary>
            [PreserveSig]
            HRESULT Write([MarshalAs(UnmanagedType.LPArray, SizeParamIndex = 1)]
                byte[] pv, int cb, IntPtr pcbWritten);

            /// <summary>Moves the seek pointer; plibNewPosition (optional) receives the new position.</summary>
            [PreserveSig]
            HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition);

            [PreserveSig]
            HRESULT SetSize(long libNewSize);

            [PreserveSig]
            HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten);

            [PreserveSig]
            HRESULT Commit(int grfCommitFlags);

            [PreserveSig]
            HRESULT Revert();

            [PreserveSig]
            HRESULT LockRegion(long libOffset, long cb, int dwLockType);

            [PreserveSig]
            HRESULT UnlockRegion(long libOffset, long cb, int dwLockType);

            /// <summary>Retrieves information about the stream.</summary>
            [PreserveSig]
            HRESULT Stat(out STATSTG pstatstg, int grfStatFlag);

            [PreserveSig]
            HRESULT Clone(out IStream ppstm);
        }

        /// <summary>
        /// Read-only adapter that exposes a <see cref="Stream"/> through the IStream
        /// interface so a filter can pull content from it. Only Read, Seek and Stat are
        /// implemented; every other member returns E_NOTIMPL.
        /// </summary>
        public class ManagedStream : IStream
        {
            private readonly Stream _stream;

            /// <param name="stream">Stream to expose; the adapter does not dispose it.</param>
            /// <exception cref="ArgumentNullException"><paramref name="stream"/> is null.</exception>
            public ManagedStream(Stream stream)
            {
                _stream = stream ?? throw new ArgumentNullException(nameof(stream));
            }

            public HRESULT Clone(out IStream ppstm)
            {
                ppstm = null;
                return HRESULT.E_NOTIMPL;
            }

            public HRESULT Commit(int grfCommitFlags) => HRESULT.E_NOTIMPL;

            public HRESULT CopyTo(IStream pstm, long cb, IntPtr pcbRead, IntPtr pcbWritten) => HRESULT.E_NOTIMPL;

            public HRESULT LockRegion(long libOffset, long cb, int dwLockType) => HRESULT.E_NOTIMPL;

            /// <summary>
            /// Reads up to <paramref name="cb"/> bytes. Stream.Read may return fewer bytes
            /// than requested before the end of the data, which a caller could take for
            /// end-of-stream, so reading continues until the request is satisfied or the
            /// underlying stream is exhausted.
            /// </summary>
            public HRESULT Read(byte[] pv, int cb, IntPtr pcbRead)
            {
                var bytesRead = 0;
                while (bytesRead < cb)
                {
                    var read = _stream.Read(pv, bytesRead, cb - bytesRead);
                    if (read == 0)
                    {
                        break;
                    }

                    bytesRead += read;
                }

                if (pcbRead != IntPtr.Zero)
                {
                    Marshal.WriteInt32(pcbRead, bytesRead);
                }

                return HRESULT.S_OK;
            }

            public HRESULT Revert() => HRESULT.E_NOTIMPL;

            /// <summary>Moves the position of the underlying stream; E_FAIL for an unknown origin.</summary>
            public HRESULT Seek(long dlibMove, int dwOrigin, IntPtr plibNewPosition)
            {
                SeekOrigin seekOrigin;

                switch (dwOrigin)
                {
                    case (int) STREAM_SEEK.STREAM_SEEK_SET:
                        seekOrigin = SeekOrigin.Begin;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_CUR:
                        seekOrigin = SeekOrigin.Current;
                        break;
                    case (int) STREAM_SEEK.STREAM_SEEK_END:
                        seekOrigin = SeekOrigin.End;
                        break;
                    default:
                        return HRESULT.E_FAIL;
                }

                var position = _stream.Seek(dlibMove, seekOrigin);

                if (plibNewPosition != IntPtr.Zero)
                {
                    Marshal.WriteInt64(plibNewPosition, position);
                }

                return HRESULT.S_OK;
            }

            public HRESULT SetSize(long libNewSize) => HRESULT.E_NOTIMPL;

            /// <summary>
            /// Reports the stream's type, length and access mode; E_ACCESSDENIED when the
            /// underlying stream can be neither read nor written.
            /// </summary>
            public HRESULT Stat(out STATSTG pstatstg, int grfStatFlag)
            {
                pstatstg = new STATSTG
                {
                    type = (int) STGTY.STGTY_STREAM,
                    cbSize = _stream.Length,
                    grfMode = (int) STGM.STGM_READ
                };

                if (_stream.CanRead && _stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READWRITE;
                }
                else if (_stream.CanRead)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_READ;
                }
                else if (_stream.CanWrite)
                {
                    pstatstg.grfMode |= (int) STGM.STGM_WRITE;
                }
                else
                {
                    return HRESULT.E_ACCESSDENIED;
                }

                return HRESULT.S_OK;
            }

            public HRESULT UnlockRegion(long libOffset, long cb, int dwLockType) => HRESULT.E_NOTIMPL;

            public HRESULT Write(byte[] pv, int cb, IntPtr pcbWritten) => HRESULT.E_NOTIMPL;
        }

        /// <summary>P/Invoke entry point used to obtain a filter.</summary>
        public class NativeMethods
        {
            /// <summary>Loads the IFilter registered for the given path or file extension.</summary>
            [DllImport("query.dll", SetLastError = true, CharSet = CharSet.Unicode)]
            public static extern HRESULT LoadIFilter(string pwcsPath, [MarshalAs(UnmanagedType.IUnknown)] object pUnkOuter, ref IFilter ppIUnk);
        }

        /// <summary>64-bit timestamp split into two 32-bit halves (native FILETIME).</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct FILETIME
        {
            public uint DateTimeLow;  // low-order 32 bits
            public uint DateTimeHigh; // high-order 32 bits
        }

        /// <summary>Position and extent of text within a chunk (native FILTERREGION).</summary>
        /// <remarks>
        /// The native members are 32-bit ULONGs. C# <c>ulong</c> is 64 bits wide and made
        /// the struct twice the native size, so the fields are declared as <c>uint</c>.
        /// </remarks>
        [StructLayout(LayoutKind.Sequential)]
        public struct FILTERREGION
        {
            public uint idChunk;   // chunk identifier
            public uint cwcStart;  // start of the region within the chunk
            public uint cwcExtent; // extent of the region
        }

        /// <summary>Identifies a property by property set GUID plus property specifier.</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct FULLPROPSPEC
        {
            public Guid guidPropSet;    // property set the property belongs to
            public PROPSPEC psProperty; // property within that set
        }

        /// <summary>
        /// Property specifier (native PROPSPEC): a 32-bit kind followed by a union of a
        /// numeric property id and a pointer to a property name.
        /// </summary>
        /// <remarks>
        /// The native union occupies one pointer-sized, pointer-aligned slot, so it is
        /// stored once (as lpwstr) and propid is a view over its low 32 bits. The kind
        /// is kept in a private 32-bit field so the layout does not depend on the
        /// underlying type of PROPSPECKIND. Three sequential public fields gave wrong
        /// offsets and shifted the STAT_CHUNK members that follow.
        /// </remarks>
        [StructLayout(LayoutKind.Sequential)]
        public struct PROPSPEC
        {
            private uint _kind;

            public IntPtr lpwstr;

            /// <summary>Tells which member of the union is in use.</summary>
            public PROPSPECKIND ulKind
            {
                get => (PROPSPECKIND) _kind;
                set => _kind = (uint) value;
            }

            /// <summary>Numeric property id; meaningful when ulKind is PRSPEC_PROPID.</summary>
            public uint propid
            {
                get => unchecked((uint) lpwstr.ToInt64());
                set => lpwstr = new IntPtr(unchecked((int) value));
            }
        }

        /// <summary>Chunk descriptor filled in by IFilter.GetChunk (native STAT_CHUNK).</summary>
        [StructLayout(LayoutKind.Sequential)]
        public struct STAT_CHUNK
        {
            /// <summary>Chunk identifier.</summary>
            public int idChunk;

            /// <summary>Break that precedes this chunk.</summary>
            public CHUNK_BREAKTYPE breakType;

            /// <summary>Whether the chunk is text-type and/or value-type.</summary>
            public CHUNKSTATE flags;

            /// <summary>Locale identifier of the chunk.</summary>
            public int locale;

            /// <summary>Property this chunk belongs to.</summary>
            public FULLPROPSPEC attribute;

            /// <summary>Identifier of the chunk this one was derived from.</summary>
            public int idChunkSource;

            /// <summary>Start offset within the source chunk.</summary>
            public int cwcStartSource;

            /// <summary>Length within the source chunk.</summary>
            public int cwcLenSource;
        }

        /// <summary>Statistics reported by IStream.Stat (native STATSTG).</summary>
        /// <remarks>
        /// pwcsName is a wide-character string pointer in the native structure, so the
        /// string marshalling is stated explicitly instead of relying on the default.
        /// </remarks>
        [StructLayout(LayoutKind.Sequential, CharSet = CharSet.Unicode)]
        public struct STATSTG
        {
            [MarshalAs(UnmanagedType.LPWStr)]
            public string pwcsName;
            public int type;              // a STGTY value
            public long cbSize;           // size of the stream in bytes
            public FILETIME mtime;
            public FILETIME ctime;
            public FILETIME atime;
            public int grfMode;           // STGM access mode
            public int grfLocksSupported;
            public Guid clsid;
            public int grfStateBits;
            public int reserved;
        }

        /// <summary>HRESULT values returned by the IFilter methods.</summary>
        public enum IFilterReturnCodes : uint
        {
            S_OK = 0,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_INVALIDARG = 0x80070057,
            E_OUTOFMEMORY = 0x8007000E,
            E_NOTIMPL = 0x80004001,
            // Was 0x80000008, the 16-bit Windows value; the HRESULT enum in this same
            // listing already uses the Win32 value 0x80004005.
            E_FAIL = 0x80004005,

            FILTER_E_PASSWORD = 0x8004170B,
            FILTER_E_UNKNOWNFORMAT = 0x8004170C,
            FILTER_E_NO_TEXT = 0x80041705,
            FILTER_E_NO_VALUES = 0x80041706,
            FILTER_E_END_OF_CHUNKS = 0x80041700,
            FILTER_E_NO_MORE_TEXT = 0x80041701,
            FILTER_E_NO_MORE_VALUES = 0x80041702,
            FILTER_E_ACCESS = 0x80041703,
            FILTER_W_MONIKER_CLIPPED = 0x00041704,
            FILTER_E_EMBEDDING_UNAVAILABLE = 0x80041707,
            FILTER_E_LINK_UNAVAILABLE = 0x80041708,
            FILTER_S_LAST_TEXT = 0x00041709,
            FILTER_S_LAST_VALUES = 0x0004170A
        }

        /// <summary>Kind of break that separates a chunk from the previous one.</summary>
        public enum CHUNK_BREAKTYPE : uint
        {
            CHUNK_NO_BREAK = 0, // no break
            CHUNK_EOW = 1,      // end of word
            CHUNK_EOS = 2,      // end of sentence
            CHUNK_EOP = 3,      // end of paragraph
            CHUNK_EOC = 4       // end of chapter
        }

        /// <summary>Bit flags telling whether a chunk carries text, a value, or both.</summary>
        [Flags]
        public enum CHUNKSTATE : uint
        {
            CHUNK_TEXT = 0x1,
            CHUNK_VALUE = 0x2,
            CHUNK_FILTER_OWNED_VALUE = 0x4
        }

        /// <summary>General COM result codes returned by LoadIFilter and the IStream implementation.</summary>
        public enum HRESULT : uint
        {
            S_OK = 0x00000000,
            E_NOTIMPL = 0x80004001,
            E_NOINTERFACE = 0x80004002,
            E_POINTER = 0x80004003,
            E_ABORT = 0x80004004,
            E_FAIL = 0x80004005,
            E_UNEXPECTED = 0x8000FFFF,
            E_ACCESSDENIED = 0x80070005,
            E_HANDLE = 0x80070006,
            E_OUTOFMEMORY = 0x8007000E,
            E_INVALIDARG = 0x80070057
        }

        /// <summary>Flags returned by IFilter.Init.</summary>
        public enum IFILTER_FLAGS
        {
            // The caller should also use the object's OLE property set interfaces to
            // find further properties.
            IFILTER_FLAGS_OLE_PROPERTIES = 1
        }

        /// <summary>
        /// Flags passed to IFilter.Init. Only IFILTER_INIT_SEARCH_LINKS survived in the
        /// listing, although the loading code uses IFILTER_INIT_APPLY_INDEX_ATTRIBUTES;
        /// the remaining members are restored with their native values.
        /// </summary>
        [Flags]
        public enum IFILTER_INIT
        {
            IFILTER_INIT_CANON_PARAGRAPHS = 1,
            IFILTER_INIT_HARD_LINE_BREAKS = 2,
            IFILTER_INIT_CANON_HYPHENS = 4,
            IFILTER_INIT_CANON_SPACES = 8,
            IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16,
            IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32,
            IFILTER_INIT_INDEXING_ONLY = 64,
            IFILTER_INIT_SEARCH_LINKS = 128,
            IFILTER_INIT_APPLY_CRAWL_ATTRIBUTES = 256,
            IFILTER_INIT_FILTER_OWNED_VALUE_OK = 512
        }

        /// <summary>Selects which member of the PROPSPEC union is in use.</summary>
        /// <remarks>
        /// The native kind is a 32-bit ULONG. C# <c>ulong</c> is 64 bits wide, so the
        /// underlying type is <c>uint</c> to match when the enum is marshalled.
        /// </remarks>
        public enum PROPSPECKIND : uint
        {
            PRSPEC_LPWSTR = 0, // the property is named by a string
            PRSPEC_PROPID = 1  // the property is named by a numeric id
        }

        /// <summary>Storage access-mode constants, as reported in STATSTG.grfMode.</summary>
        /// <remarks>
        /// The native values are 32-bit DWORDs, so the underlying type is <c>uint</c>
        /// rather than the 64-bit <c>ulong</c>. Several names share the value 0.
        /// </remarks>
        public enum STGM : uint
        {
            STGM_READ = 0x00000000,
            STGM_WRITE = 0x00000001,
            STGM_READWRITE = 0x00000002,
            STGM_SHARE_DENY_NONE = 0x00000040,
            STGM_SHARE_DENY_READ = 0x00000030,
            STGM_SHARE_DENY_WRITE = 0x00000020,
            STGM_SHARE_EXCLUSIVE = 0x00000010,
            STGM_PRIORITY = 0x00040000,
            STGM_CREATE = 0x00001000,
            STGM_CONVERT = 0x00020000,
            STGM_FAILIFTHERE = 0x00000000,
            STGM_DIRECT = 0x00000000,
            STGM_TRANSACTED = 0x00010000,
            STGM_NOSCRATCH = 0x00100000,
            STGM_NOSNAPSHOT = 0x00200000,
            STGM_SIMPLE = 0x08000000,
            STGM_DIRECT_SWMR = 0x00400000,
            STGM_DELETEONRELEASE = 0x04000000
        }

        /// <summary>Type of storage element, as reported in STATSTG.type.</summary>
        public enum STGTY : int
        {
            STGTY_STORAGE = 1,
            STGTY_STREAM = 2,
            STGTY_LOCKBYTES = 3,
            STGTY_PROPERTY = 4
        }

        /// <summary>Origin values accepted by IStream.Seek.</summary>
        public enum STREAM_SEEK : int
        {
            STREAM_SEEK_SET = 0, // relative to the beginning of the stream
            STREAM_SEEK_CUR = 1, // relative to the current position
            STREAM_SEEK_END = 2  // relative to the end of the stream
        }
