text analysis for imdb movie review using rapidminer

https://stackoverflow.com/questions/22254057

11-06-2023
|

Question

I am analysing the reviews for a particular movie using RapidMiner. I used the "Get Pages" operator to extract the reviews from IMDb. There are around 94 reviews listed on the site, but after extraction I am getting only 21 of them. The XML code of the process is:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
  <!--
    RapidMiner process: reads a column of links from a spreadsheet, retrieves
    the linked pages, cuts each page into segments, extracts a "review" and a
    "rating" value from every segment, and writes the resulting example set to
    an Excel file.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Reads cells A1:A5 of rev.xlsx. The meta-data entry for column 0 carries the name "Link", which Get Pages refers to below. -->
      <operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\Arbind\Desktop\review\rev.xlsx"/>
        <parameter key="imported_cell_range" value="A1:A5"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Link.true.text.attribute"/>
        </list>
      </operator>
      <!-- Retrieves web pages using the "Link" attribute of the incoming example set as the address. -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.3.001" expanded="true" height="60" name="Get Pages" width="90" x="179" y="30">
        <parameter key="link_attribute" value="Link"/>
      </operator>
      <!-- Keeps only the single attribute "gensym1" (presumably the retrieved page content; confirm against the Get Pages output). -->
      <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="gensym1"/>
      </operator>
      <!-- Runs the nested sub-process on each document; pruning is percentual with an upper bound of 90.0. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="90.0"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!--
            Cuts the page into segments with a "Regular Region" query named
            "extract". The query value appears to hold a start expression and an
            end expression, and both halves are the same pattern (an hr tag with
            align="center").
            NOTE(review): according to the answer given for this process,
            identical start and end expressions make the operator return only
            every other review, because a separator consumed as an end match
            cannot also start the next region. Confirm against the Cut Document
            documentation before changing the query.
          -->
          <operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true" height="60" name="Cut Document" width="90" x="112" y="30">
            <parameter key="query_type" value="Regular Region"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries">
              <parameter key="extract" value="&lt;hr[^&gt;]\.*align=&quot;center&quot;&gt;.&lt;hr[^&gt;]\.*align=&quot;center&quot;&gt;"/>
            </list>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true">
              <!-- XPath extraction per segment: "review" selects the text nodes of h:p elements, "rating" selects the alt attribute of h:img elements. -->
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="112" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="review" value="//h:p/text()"/>
                  <parameter key="rating" value="//h:img/@alt"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the example set produced by Process Documents from Data to "imdb rev2.xlsx". -->
      <operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="514" y="210">
        <parameter key="excel_file" value="C:\Users\Arbind\Desktop\review\imdb rev2.xlsx"/>
      </operator>
      <!-- Wiring: Read Excel, Get Pages, Select Attributes, Process Documents from Data, Write Excel; the word list and the Write Excel pass-through are delivered as process results 1 and 2. -->
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

In the Excel file I listed the links to the review pages one after the other. I also need the "ratings" given by the users.

Solution

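The regular region setting is finding only every other review. This is because the regular expression for the start of the region is the same as the one for the end. Once an end has been found, that separator has been consumed, so it cannot also serve as the start of the following review; no new start will be found until the separator at the end of the next review.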

OTHER TIPS

The imported cell range is A1:A5 on the spreadsheet import. This means there is an implied for loop to fetch these 5 URLs. This in turn leads to Get Pages which could follow redirects leading to many more possible fetches. It's impossible to know without knowing the URLs.

Can you post a snippet of the spreadsheet?

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow