I don't want any metadata added/extracted

https://stackoverflow.com/questions/22304612

12-06-2023
|

質問

I'm using Tika to index PDF files, and in the Solr UI I can see that a lot of metadata and other "stuff" that I don't care about is indexed too:

"response": {
    "numFound": 1,
    "start": 0,
    "docs": [
      {
        "meta": [
          "dc:subject",
          "",
          "meta:save-date",
          "2014-01-09T11:07:45Z",
          "subject",
          "",
          "Author",
          "smalik",
          "dcterms:created",
          "2014-01-09T11:07:45Z",
          "date",
          "2014-01-09T11:07:45Z",
          "creator",
          "smalik",
          "Creation-Date",
          "2014-01-09T11:07:45Z",
          "meta:author",
          "johndoe",
          "stream_content_type",
          "",
          "created",
          "Thu Jan 09 12:07:45 CET 2014",
          "stream_size",
          "null",
          "meta:keyword",
          "",
          "cp:subject",
          "",
          "xmp:CreatorTool",
          "PScript5.dll Version 5.2.2",
          "Keywords",
          "",
          "Last-Save-Date",
          "2014-01-09T11:07:45Z",
          "dc:title",
          "E-Mail zur Archivierung",
          "meta:creation-date",
          "2014-01-09T11:07:45Z",
          "dcterms:modified",
          "2014-01-09T11:07:45Z",
          "dc:creator",
          "johndoe",
          "Last-Modified",
          "2014-01-09T11:07:45Z",
          "modified",
          "2014-01-09T11:07:45Z",
          "xmpTPg:NPages",
          "1",
          "producer",
          "www.adlibsoftware.com:EXS41012-Windows 2008 R2:TNG",
          "Content-Type",
          "application/pdf"
        ],
        "div": [
          "page"
        ],
        "id": [
          "aaa11besd4effsujqub6toubqr4m3.pdf"
        ],
        "dc_subject": [
          ""
        ],
        "meta_save_date": [
          "2014-01-09T11:07:45Z"
        ],
        "subject": [
          ""
        ],
        "author": [
          "johndoe"
        ],
        "dcterms_created": [
          "2014-01-09T11:07:45Z"
        ],
        "date": [
          "2014-01-09T11:07:45Z"
        ],
        "creator": [
          "johndoe"
        ],
        "creation_date": [
          "2014-01-09T11:07:45Z"
        ],
        "title": [
          "E-Mail zur Archivierung"
        ],
        "meta_author": [
          "johndoe"
        ],
        "stream_content_type": [
          ""
        ],
        "created": [
          "Thu Jan 09 12:07:45 CET 2014"
        ],
        "stream_size": [
          "null"
        ],
        "meta_keyword": [
          ""
        ],
        "cp_subject": [
          ""
        ],
        "xmp_creatortool": [
          "PScript5.dll Version 5.2.2"
        ],
        "keywords": [
          ""
        ],
        "last_save_date": [
          "2014-01-09T11:07:45Z"
        ],
        "dc_title": [
          "E-Mail zur Archivierung"
        ],
        "meta_creation_date": [
          "2014-01-09T11:07:45Z"
        ],
        "dcterms_modified": [
          "2014-01-09T11:07:45Z"
        ],
        "dc_creator": [
          "johndoe"
        ],
        "last_modified": [
          "2014-01-09T11:07:45Z"
        ],
        "modified": [
          "2014-01-09T11:07:45Z"
        ],
        "xmptpg_npages": [
          "1"
        ],
        "producer": [
          "www.adlibsoftware.com:EXS41012-Windows 2008 R2:TNG"
        ],
        "content_type": [
          "application/pdf"
        ],
        "fullText": [" abcdef"],
        "uid": "d41d8cd98f00b204e9800998ecf8427e"
      }
    ]
  }

As I'm only interested in "fullText" and "id", I would like to know how/what I have to set/define in the schema.xml and/or solrconfig.xml to avoid all the unnecessary data.

What I would like is something like this:

"response": {
        "numFound": 1,
        "start": 0,
        "docs": [
          {
            "id": [
              "aaa11besd4effsujqub6toubqr4m3.pdf"
            ],
            "fullText": [" abcdef"],
            "uid": "d41d8cd98f00b204e9800998ecf8427e"
          }
        ]
      }

Currently, my schema.xml and solrconfig.xml look like this:

<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr schema "simple": two field types, one explicit key field (uid),
     a catch-all dynamic field and one analyzed text field (content). -->
<schema name="simple" version="1.1">
    <types>
        <!-- Plain string type (solr.StrField); used by uid and by the catch-all dynamic field. -->
        <fieldtype name="string" class="solr.StrField" postingsFormat="SimpleText" />
        <!-- Analyzed text type: matches of the pattern "\n" are replaced with nothing
             before tokenizing, then the filters below are applied in the order listed. -->
        <fieldtype name="text" class="solr.TextField" postingsFormat="SimpleText">
            <analyzer>
                <charFilter class="solr.PatternReplaceCharFilterFactory" pattern="\n" replacement=""/>
                <tokenizer class="solr.StandardTokenizerFactory"/>
                <filter class="solr.LowerCaseFilterFactory" /> <!--Lowercases the letters in each token. Leaves non-letter tokens alone.-->
                <filter class="solr.ClassicFilterFactory" /> <!--Removes dots from acronyms and 's from the end of tokens. Works only on typed tokens produced by ClassicTokenizer or equivalent.-->
                <filter class="solr.TrimFilterFactory"/> <!--Trims whitespace at either end of a token. -->
                <filter class="solr.StopFilterFactory" ignoreCase="true"/> <!--Discards common words.  -->
                <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
            </analyzer>
        </fieldtype>
    </types>

    <fields>
        <!-- Key field (referenced by uniqueKey below): single-valued, indexed and stored. -->
        <field name="uid" type="string" indexed="true" stored="true"
            multiValued="false" />
        <!-- Catch-all: the "*" pattern matches any field name that is not declared
             explicitly, so every such field is indexed and stored as a multi-valued string.
             NOTE(review): presumably this is why every Tika metadata field shows up in
             the query response; confirm. -->
        <dynamicField name="*" type="string" multiValued="true"
            indexed="true" stored="true" />
        <!-- The only field that uses the analyzed "text" type; no stored attribute is set.
             NOTE(review): the /update/extract handler in solrconfig.xml maps "content" to
             "fullText" (fmap.content), so verify that anything is actually written here. -->
        <field name="content" indexed="true"  type="text" multiValued="true" />
    </fields>

    <!-- Queries that name no field are run against "content". -->
    <defaultSearchField>content</defaultSearchField>

    <!-- Query terms are combined with OR unless the query says otherwise. -->
    <solrQueryParser defaultOperator="OR" />
    <uniqueKey>uid</uniqueKey>
</schema>


<?xml version="1.0" encoding="UTF-8" ?>
<!-- Solr core configuration: library paths, the default search handler, the plain
     and Tika-extracting update handlers, and the "deduplication" update chain. -->
<config>
    <luceneMatchVersion>LUCENE_45</luceneMatchVersion>
    <directoryFactory name='DirectoryFactory' class='solr.MMapDirectoryFactory' />

    <codecFactory name="CodecFactory" class="solr.SchemaCodecFactory" />

    <!-- Extra jars: the core's lib directory, the solr-cell jar from dist, and
         every jar under contrib\extraction\lib. -->
    <lib dir='${solr.core.instanceDir}\lib' />
    <lib dir="${solr.core.instanceDir}\dist\" regex="solr-cell-\d.*\.jar" />
    <lib dir="${solr.core.instanceDir}\contrib\extraction\lib" regex=".*\.jar" />

    <!-- Default request handler for searches. -->
    <requestHandler name="standard" class="solr.StandardRequestHandler" default="true" />

    <!-- Plain update requests go through the "deduplication" chain defined below. -->
    <requestHandler name="/update" class="solr.UpdateRequestHandler">
        <lst name="defaults">
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <!-- Extraction handler (ExtractingRequestHandler); the defaults below apply to
         every request that does not override them. -->
    <requestHandler name="/update/extract"
        class="solr.extraction.ExtractingRequestHandler">
        <lst name="defaults">
            <!-- NOTE(review): captureAttr is listed twice in this list with the same
                 value; one of the two entries is redundant. -->
            <str name="captureAttr">true</str>
            <str name="lowernames">true</str>
            <str name="overwrite">false</str>
            <str name="captureAttr">true</str>
            <str name="literalsOverride">true</str>
            <!-- NOTE(review): the schema shown with this config declares no ignored_*
                 field, and its "*" dynamic field matches every name; confirm whether
                 this prefix ever takes effect. -->
            <str name="uprefix">ignored_</str>
            <!-- Field renames: "a" becomes "link", "content" becomes "fullText". -->
            <str name="fmap.a">link</str>
            <str name="fmap.content">fullText</str>
            <!-- the configuration here could be useful for tests -->
            <str name="update.chain">deduplication</str>
        </lst>
    </requestHandler>

    <!-- Signature chain: a TextProfileSignature is computed from the "content" field
         and written to "uid"; overwriteDupes is false.
         NOTE(review): /update/extract renames "content" to "fullText", so "content"
         may be empty when the signature is computed. The uid in the sample response
         (d41d8cd98f00b204e9800998ecf8427e) is the MD5 digest of empty input, which
         would be consistent with that; confirm. -->
    <updateRequestProcessorChain name="deduplication">
        <processor
            class="org.apache.solr.update.processor.SignatureUpdateProcessorFactory">
            <bool name="overwriteDupes">false</bool>
            <str name="signatureField">uid</str>
            <bool name="enabled">true</bool>
            <str name="fields">content</str>
            <str name="minTokenLen">10</str>
            <str name="quantRate">.2</str>
            <str name="signatureClass">solr.update.processor.TextProfileSignature</str>
        </processor>
        <processor class="solr.LogUpdateProcessorFactory" />
        <processor class="solr.RunUpdateProcessorFactory" />
    </updateRequestProcessorChain>

    <!-- Admin handlers registered under /admin/. -->
    <requestHandler name="/admin/"
        class="org.apache.solr.handler.admin.AdminHandlers" />

    <!-- Index locking is set to "none". -->
    <lockType>none</lockType>

    <!-- Default query for the admin UI: match all documents. -->
    <admin>
        <defaultQuery>*:*</defaultQuery>
    </admin>

</config>

解決

See Alexandre's answer and examples here. If you are getting fields you do not need, you need to declare them explicitly in your schema and set both indexed and stored to false, which makes Solr ignore the field. You can also use a dynamic field to ignore a whole group of them that share a common prefix or suffix, which is typically the case with documents generated by Tika. For example, a dynamic field such as ignored_* declared with indexed="false" and stored="false" drops every field whose name starts with ignored_.

ライセンス： CC-BY-SA と帰属

所属していません StackOverflow