Question

I have a data.frame which describes a bipartite graph with a very large (millions of vertices) independent set and a reasonably small (hundreds of vertices) one.

I want to get the bipartite projection of the graph onto the smaller independent set, but without first creating the large bipartite graph and, especially, the huge bipartite projection onto the larger independent set. The reason for this constraint is that igraph segfaults and I run out of RAM (I have only 8GB).

E.g., given

data.frame(beg=c("a","a","b","b","c","c"),
           end=c("1","2","1","2","1","2"),
           weight=1:6)

I want the data frame

data.frame(beg=c("a","a","b"),
           end=c("b","c","c"),
           weight=c(1+3+2+4,1+5+2+6,3+5+4+6))

where the weight of each projected edge is the sum of the weights of the bipartite edges through the shared vertices.

(In this example, {a, b, c} is the "smaller" set and {1, 2} is the "larger" one.)
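
For reference, the direct igraph route I am trying to avoid looks roughly like the sketch below (assuming igraph's graph.data.frame() and bipartite.projection()): it materialises the whole bipartite graph and both projections, and with multiplicity=TRUE it only counts shared neighbours rather than summing the original edge weights.

library(igraph)

f <- data.frame(beg=c("a","a","b","b","c","c"),
                end=c("1","2","1","2","1","2"),
                weight=1:6)

g <- graph.data.frame(f, directed=FALSE)
V(g)$type <- V(g)$name %in% f$end        # TRUE marks the "larger" side
p <- bipartite.projection(g, multiplicity=TRUE)
p$proj1                                  # projection onto {a, b, c}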


Solution

Here is something which appears to do what I need (the key is to use data.table for a fast keyed self-join):

> library(igraph)
> library(data.table)
data.table 1.8.8  For help type: help("data.table")
> f <- data.frame(beg=c("a","a","b","b","c","c"),
                  end=c("1","2","1","2","1","2"),
                  count=1:6)
> f <- data.table(f, key="end")
> f
   beg end count
1:   a   1     1
2:   b   1     3
3:   c   1     5
4:   a   2     2
5:   b   2     4
6:   c   2     6
> m <- f[f,allow.cartesian=TRUE]

> m
    end beg  count beg.1  count.1
 1:   1   a      1     a        1
 2:   1   b      3     a        1
 3:   1   c      5     a        1
 4:   1   a      1     b        3
 5:   1   b      3     b        3
 6:   1   c      5     b        3
 7:   1   a      1     c        5
 8:   1   b      3     c        5
 9:   1   c      5     c        5
10:   2   a      2     a        2
11:   2   b      4     a        2
12:   2   c      6     a        2
13:   2   a      2     b        4
14:   2   b      4     b        4
15:   2   c      6     b        4
16:   2   a      2     c        6
17:   2   b      4     c        6
18:   2   c      6     c        6
> v <- m$beg == m$beg.1
> m$end <- NULL
> m$weight <- (m$count + m$count.1)/2
> m$count <- NULL
> m$count.1 <- NULL
> m
    beg beg.1 weight
 1:   a     a      1
 2:   b     a      2
 3:   c     a      3
 4:   a     b      2
 5:   b     b      3
 6:   c     b      4
 7:   a     c      3
 8:   b     c      4
 9:   c     c      5
10:   a     a      2
11:   b     a      3
12:   c     a      4
13:   a     b      3
14:   b     b      4
15:   c     b      5
16:   a     c      4
17:   b     c      5
18:   c     c      6
> ve <- data.table(vertex=m$beg[v], weight=m$weight[v], key="vertex")
> ve <- ve[, list(count = .N, weight = sum(weight)), by = "vertex"]
> ve
   vertex count weight
1:      a     2      3
2:      b     2      7
3:      c     2     11
> g1 <- graph.data.frame(m[!v,], vertices=ve, directed=FALSE)
> g1 <- simplify(g1, edge.attr.comb="sum")
> V(g1)$weight
[1]  3  7 11
> E(g1)$weight
[1] 10 14 18
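
A note on why this works: in the keyed self-join every unordered pair {x, y} shows up twice per shared end vertex (once as (x, y) and once as (y, x)), which is why the pair weights are halved by (count + count.1)/2 and then added back up by simplify(..., edge.attr.comb="sum"). The same idea can be condensed into a small function; this is only a sketch (the name project_small is mine), using merge() with explicit suffixes instead of the keyed join so the column names are predictable, and with beg/end/weight columns as in the question:

library(data.table)

project_small <- function(edges) {
  dt <- as.data.table(edges)
  dt[, beg := as.character(beg)]           # so the pairs can be ordered below
  # all pairs of "beg" vertices that share an "end" vertex
  m <- merge(dt, dt, by = "end", allow.cartesian = TRUE,
             suffixes = c(".x", ".y"))
  # keep each unordered pair once and drop self-pairs
  m <- m[beg.x < beg.y]
  # projected weight of {x, y}: sum of w(x, e) + w(y, e) over all shared e
  m[, .(weight = sum(weight.x + weight.y)), by = .(beg = beg.x, end = beg.y)]
}

project_small(data.frame(beg=c("a","a","b","b","c","c"),
                         end=c("1","2","1","2","1","2"),
                         weight=1:6))

On the example this gives the a-b, a-c and b-c weights 10, 14 and 18, matching the hand-computed sums in the question.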

Other tips

So here is how I'd do it (assuming your edge list is in df and the "small" set is in the beg column, i.e. at the beginning of each edge).

For every pair of nodes in the small set I'll use the following:

do.pair = function(x, y) {
    # "end" vertices shared by x and y ...
    tmp = intersect(df$end[df$beg == x], df$end[df$beg == y])
    # ... and the total weight of the edges from x and y to those shared ends
    res = sum(df$weight[(df$beg %in% c(x, y)) & (df$end %in% tmp)])
    return(res)
}

Now I'd create the list of pairs in your favorite way (you can use expand.grid or outer) and then use the relevant apply function with the helper above; a sketch of that variant follows the loop below. Here I just do a simple nested loop, which is not extremely efficient but simple to read.

g.small = unique(df$beg)
n = length(g.small)
res = list()
cnt=0
for (i in 1:(n-1)) {
    for (j in (i+1):n) {
       cnt = cnt+1
       res[[cnt]] = list(beg=g.small[i],end=g.small[j],weight=do.pair(g.small[i],g.small[j]))
    }
}

do.call(rbind,res)
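
And here is a sketch of the apply-style variant mentioned above: enumerate the unordered pairs with combn() and map do.pair() over them (same df and do.pair as defined here):

pairs <- t(combn(sort(unique(as.character(df$beg))), 2))
data.frame(beg    = pairs[, 1],
           end    = pairs[, 2],
           weight = mapply(do.pair, pairs[, 1], pairs[, 2]))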
Licensed under: CC-BY-SA with attribution