Question

I have TF-IDF vectors saved in a MySQL table. The table has the following schema:

id | docid | word    | weight | class/label | timestamp
1  | 1     | argon   | 0.2123 | pos         | 2013-03-25 16:22:48
2  | 1     | apple   | 0.1523 | pos         | 2013-03-25 16:22:48
3  | 2     | orange  | 0.8823 | pos         | 2013-03-25 16:22:48
4  | 2     | diffuse | 0.9812 | pos         | 2013-03-25 16:22:48
5  | 3     | master  | 0.2653 | neg         | 2013-03-25 16:22:48
6  | 3     | mouse   | 0.7623 | neg         | 2013-03-25 16:22:48

The vectors of all documents are stored vertically in the same table and are distinguished by the docid field.

I want to load them into RapidMiner in order to build a classifier for the pos and neg classes. As far as I know, the format that a classifier model in RapidMiner accepts has each document represented horizontally as a single row, like this:

docid | class/label | argon |apple   | orange  | diffuse | .... 
1     | pos         | 0.154 |0       | 0.1326  | 0.7741  | ....
2     | pos         | 0.545 |0       | 0       | 0.77    | ....
3     | neg         | 0.565 |0.122   | 0.1555  | 0       | ....

I could write some code to do this and save the result as CSV files to upload to RapidMiner, but I would prefer a faster way within RapidMiner: either making it accept the first format, converting it to the second format, or even a MySQL query that does the conversion.

Keep in mind that the vectors table is very large (around 500 MB), so scalability is an issue.

Was it helpful?

Solution

The "Pivot" operator will do this job for you. Set docid as your group attribute and word as the index attribute, and you will get something similar to what you want. To get exactly what you want, you will have to remove the id attribute beforehand, then rename the attributes and replace the missing values afterwards.

I have built a small example for you. Ignore the CSV operator and replace it with your "Read database" operator.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner process that reshapes a "long" table (one row per docid/word pair)
      into a "wide" table (one row per docid). Operator chain, as wired by the
      <connect> elements at the bottom:
        Read CSV -> Remove id -> Pivot -> Remove prefix -> Replace Missing Values -> result 1
    -->
    <process version="5.3.009">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.009" expanded="true" name="Process">
        <process expanded="true">
          <!--
            Sample data source: reads a local CSV file with six columns
            (id, docid, word, weight, class/label, timestamp). The accompanying
            answer text says to swap this operator for a "Read database" operator
            when working against the real table.
            NOTE(review): "timestamp" is declared as binominal below, which looks
            unintended for a date/time column; confirm the desired value type.
          -->
          <operator activated="true" class="read_csv" compatibility="5.3.009" expanded="true" height="60" name="Read CSV" width="90" x="45" y="30">
            <parameter key="csv_file" value="~/temp/stackoverflow/vectors.csv"/>
            <parameter key="trim_lines" value="true"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="encoding" value="UTF-8"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="id.true.integer.attribute"/>
              <parameter key="1" value="docid.true.integer.attribute"/>
              <parameter key="2" value="word.true.polynominal.attribute"/>
              <parameter key="3" value="weight.true.real.attribute"/>
              <parameter key="4" value="class/label.true.binominal.attribute"/>
              <parameter key="5" value="timestamp.true.binominal.attribute"/>
            </list>
          </operator>
          <!--
            Drops the "id" column: the filter selects the single attribute "id"
            and invert_selection=true keeps everything except it.
            NOTE(review): the regular_expression value is presumably ignored
            because attribute_filter_type is "single"; confirm before relying on it.
          -->
          <operator activated="true" class="select_attributes" compatibility="5.3.009" expanded="true" height="76" name="Remove id" width="90" x="179" y="30">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="regular_expression" value="id_.*"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <!--
            Pivot step: "docid" is the group attribute and "word" is the index attribute.
            NOTE(review): class/label and timestamp are still present in the input
            at this point; confirm how Pivot treats those columns in the output.
          -->
          <operator activated="true" class="pivot" compatibility="5.3.009" expanded="true" height="76" name="Pivot" width="90" x="313" y="30">
            <parameter key="group_attribute" value="docid"/>
            <parameter key="index_attribute" value="word"/>
          </operator>
          <!--
            Renames attributes by replacing the text "weight_" in their names.
            No replacement text is set here, so presumably the prefix is simply
            stripped (operator default); TODO confirm.
          -->
          <operator activated="true" class="rename_by_replacing" compatibility="5.3.009" expanded="true" height="76" name="Remove prefix" width="90" x="447" y="30">
            <parameter key="replace_what" value="weight_"/>
          </operator>
          <!--
            Fills missing values in numeric attributes using the "zero" default.
          -->
          <operator activated="true" class="replace_missing_values" compatibility="5.3.009" expanded="true" height="94" name="Replace Missing Values" width="90" x="581" y="30">
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="default" value="zero"/>
            <list key="columns"/>
          </operator>
          <!-- Wiring: each operator's example set output feeds the next operator's input; the last one feeds the process result port. -->
          <connect from_op="Read CSV" from_port="output" to_op="Remove id" to_port="example set input"/>
          <connect from_op="Remove id" from_port="example set output" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Remove prefix" to_port="example set input"/>
          <connect from_op="Remove prefix" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top