
I want to write tags to the documents inside a corpus. The tags are stored outside the corpus, in a data frame keyed by the unique document IDs.

The challenge: (1) take each ID from the data frame, (2) find the corresponding document inside the corpus, (3) set the tag from the data frame on the corpus document with that ID.

# Fifteen synthetic document IDs ("a16" ... "o30") and one random tag for each.
someID <- paste0(letters[1:15], 16:30)
someTag <- sample(c("a", "x", "g", "h", "e"), size = 15, replace = TRUE)

data(crude)  # example corpus with 20 documents
# Attach the extra IDs to the corpus as document-level ("local") meta data.
meta(crude, type = "local", tag = "someID") <- someID

# Tag/ID lookup table, shuffled so its row order no longer follows the IDs.
mydf <- data.frame(cbind(someTag, someID))
mydf <- mydf[sample(nrow(mydf)), ]
rownames(mydf) <- 1:15  # replace the shuffled row names with 1..15

# doesn't work - my try - pseudocode

# Intended behaviour: for each row of mydf, locate the document in crude whose
# someID equals that row's ID and store the row's someTag on that document as
# "local" meta data.
# NOTE(review): this is the asker's non-working pseudocode; the closing brace
# of the loop is not present in the snippet.
for (i in 1:nrow(mydf)){
          meta(crude[which(crude$someID==mydf$someID[i])], tag="someTag", type="local") <- mydf$someTag[i]

# What the data looks like:

# R output:
> mydf
   someTag someID
1        h    l27
2        x    g22
3        h    d19
4        a    e20
5        h    i24
6        x    j25
7        h    o30
8        x    n29
9        e    h23
10       x    m28
11       h    k26
12       e    c18
13       a    a16
14       e    b17
15       x    f21

meta(crude[1], type="local")
# R output:
> meta(crude[1], type="local")
Available meta data pairs are:
  Author       : 
  DateTimeStamp: 1987-02-26 17:00:56
  Description  : 
  ID           : 127
  Language     : en
  Origin       : Reuters-21578 XML
User-defined local meta data pairs are:
[1] "YES"

[1] "TRAIN"


[1] "5670"

[1] "crude"

[1] "usa"




[1] "a16"

Thank you for any help (;



According to ?meta

# Document-level ("local") meta data.
meta(crude, tag = "someID", type = "local") <- someID

will assign the meta data tag someID at the individual document level. What you want is to create a tagging at the collection level. For this, you want to manipulate the DMetaData attribute of the corpus crude. You can do this as:

# Collection-level ("indexed") meta data.
meta(crude, tag = "someID", type = "indexed") <- someID

but I find it much easier to use the accessor

# Write the column directly on the corpus's DMetaData data frame.
DMetaData(crude)$someID <- someID

(this at least works for corpora of type VCorpus). With this adjustment:

# Fifteen synthetic document IDs ("a16" ... "o30") and one random tag for each.
someID <- paste0(letters[1:15], 16:30)
someTag <- sample(c("a", "x", "g", "h", "e"), 15, replace = TRUE)

data(crude)  # a corpus with 20 docs

# DMetaData(crude) holds one row per document, so each new column must supply
# a value for every row. Documents without an ID or tag are padded with NA;
# NA_character_ keeps both columns character instead of logical.
n_docs <- nrow(DMetaData(crude))
stopifnot(length(someID) <= n_docs)
DMetaData(crude)$someID <- c(someID, rep(NA_character_, n_docs - length(someID)))
DMetaData(crude)$someTag <- rep(NA_character_, n_docs)

# Lookup table of tags keyed by ID, shuffled so its row order differs from the
# order of the IDs in the corpus.
mydf <- data.frame(someTag, someID, stringsAsFactors = FALSE)
mydf <- mydf[sample(nrow(mydf)), ]
rownames(mydf) <- seq_len(nrow(mydf))  # overwrite the shuffled row names

# Copy each tag onto the document carrying the same ID. match() returns, for
# every document, the row of mydf that holds its ID (NA when there is none),
# so one vectorised lookup replaces the row-by-row loop. incomparables = NA
# keeps documents without an ID from ever matching an NA in mydf.
tag_row <- match(DMetaData(crude)$someID, mydf$someID, incomparables = NA)
DMetaData(crude)$someTag <- mydf$someTag[tag_row]


> DMetaData(crude)
   MetaID someID someTag  
1       0    a16       a
2       0    b17       h
3       0    c18       g
4       0    d19       a
5       0    e20       e
6       0    f21       a
7       0    g22       x
8       0    h23       g
9       0    i24       h
10      0    j25       e
11      0    k26       x
12      0    l27       a
13      0    m28       a
14      0    n29       h
15      0    o30       a
16      0   <NA>    <NA>
17      0   <NA>    <NA>
18      0   <NA>    <NA>
19      0   <NA>    <NA>
20      0   <NA>    <NA>
ライセンス: CC-BY-SA帰属
所属していません StackOverflow
scroll top