Question

Using the following data:

    > # Load the comma-separated counts; TRUE is spelled out because T can be reassigned.
    > mytable <- read.delim("mytable.csv", sep = ",", header = TRUE)
    > class(mytable)
   [1] "data.frame"
    > mytable

         count lang1 lang2
    1   908446    ar    ar
    2       96    ar    bg
    3       73    ar    bo
    4        2    ar   chr
    5       61    ar    da
    6     1282    ar    de
    7       84    ar    el
    8    28067    ar    en
    9     1178    ar    es
    10     962    ar    et
    11   25945    ar    fa
    12     100    ar    fi
    13     765    ar    fr
    14      18    ar    he
    15       1    ar    hi
    16    1036    ar    ht
    17     267    ar    hu
    18      17    ar    hy
    19    3306    ar    id
    20      23    ar    is
    21     262    ar    it
    22       1    ar    iu
    23     265    ar    ja
    24      46    ar    ka
    25     400    ar    ko
    26      43    ar    lt
    27     160    ar    lv
    28       1    ar    my
    29    1539    ar    nl
    30      28    ar    no
    31  558362    ar  none
    32     507    ar    pl
    33     847    ar    pt
    34     577    ar    ru
    35     369    ar    sk
    36     309    ar    sl
    37     127    ar    sv
    38       1    ar    ta
    39       9    ar    th
    40     911    ar    tl
    41     585    ar    tr
    42       3    ar    uk
    43   46861    ar   und
    44    6499    ar    ur
    45    2245    ar    vi
    46      17    ar    zh
    47      13    ca    ar
    48       1    ca    bg
    49      27    ca    da
    50     100    ca    de
    51     946    ca    en
    52    8840    ca    es
    53      56    ca    et
    54      15    ca    fi
    55     912    ca    fr
    56      97    ca    ht
    57      64    ca    hu
    58      96    ca    id
    59       8    ca    is
    60     556    ca    it
    61      12    ca    ja
    62       2    ca    ko
    63      13    ca    lt
    64      58    ca    lv
    65      47    ca    nl
    66       6    ca    no
    67    7729    ca  none
    68      26    ca    pl
    69    1032    ca    pt
    70      10    ca    ru
    71      62    ca    sk
    72      57    ca    sl
    73      32    ca    sv
    74      93    ca    tl
    75      39    ca    tr
    76     275    ca   und
    77      53    ca    vi
    78      14    cs    ar
    79      33    cs    bg
    80       1    cs    da
    81      64    cs    de
    82    1729    cs    en
    83     162    cs    es
    84      47    cs    et
    85       6    cs    fi
    86      39    cs    fr
    87      27    cs    ht
    88      28    cs    hu
    89      30    cs    id
    90       2    cs    is
    91      30    cs    it
    92       5    cs    ja
    93      12    cs    lt
    94      26    cs    lv
    95      18    cs    nl
    96     790    cs  none
    97      77    cs    pl
    98      86    cs    pt
    99     366    cs    ru
    100   1497    cs    sk
    101     83    cs    sl
    102      2    cs    sv
    103     26    cs    tl
    104     16    cs    tr
    105      1    cs    uk
    106    186    cs   und
    107     60    cs    vi
    108      3    cs    zh

I'd like similar counts to be clustered closer together in the heatmap produced by the following code:

> # Cross-tabulate the counts into a lang1 x lang2 table; the formula's column
> # names are looked up in `data`, so no `mytable$` prefix is needed.
> Xmytable <- xtabs(count ~ lang1 + lang2, data = mytable)
> heatmap(Xmytable)

Heatmap with poor clustering

So here are my questions:

1.) Is there an alternative way to manipulate this dataset to produce a heatmap with a color spectrum based on the count? (I'd like to create a heatmap similar to the one that I've shown)

2.) Can the clustering be improved to group similar colors near each other?

Thanks!

Was it helpful?

Solution 2

Here is the best option that I've found so far:

enter image description here

# Read the comma-separated counts (columns: count, lang1, lang2).
mytable <- read.delim("mytable.csv", sep = ",", header = TRUE)
# Log-transform the counts so the few very large cells do not dominate the
# colour scale. Note: a count of 0 would give -Inf here.
mytable$ln <- log(mytable$count)
# Drop the raw counts; only the log-transformed column is used from here on.
mytable$count <- NULL
mytable

"bio","twit","ln"
"ar","ar",13.7194907264167
"ar","bg",4.56434819146784
"ar","bo",4.29045944114839
"ar","chr",0.693147180559945
"ar","da",4.11087386417331
"ar","de",7.15617663748062
"ar","el",4.43081679884331
"ar","en",10.2423497879763
"ar","es",7.07157336421153
"ar","et",6.86901445066571
"ar","fa",10.1637341918018
"ar","fi",4.60517018598809
"ar","fr",6.63987583382654
"ar","he",2.89037175789616
"ar","hi",0
"ar","ht",6.94312242281943
"ar","hu",5.58724865840025
"ar","hy",2.83321334405622
"ar","id",8.10349427838097
"ar","is",3.13549421592915
"ar","it",5.5683445037611
"ar","iu",0
"ar","ja",5.57972982598622
"ar","ka",3.8286413964891
"ar","ko",5.99146454710798
"ar","lt",3.76120011569356
"ar","lv",5.07517381523383
"ar","my",0
"ar","nl",7.33888813383888
"ar","no",3.3322045101752
"ar","NONE",13.2327627765388
"ar","pl",6.22851100359118
"ar","pt",6.74170069465205
"ar","ru",6.3578422665081
"ar","sk",5.91079664404053
"ar","sl",5.73334127689775
"ar","sv",4.84418708645859
"ar","ta",0
"ar","th",2.19722457733622
"ar","tl",6.81454289725996
"ar","tr",6.37161184723186
"ar","uk",1.09861228866811
"ar","und",10.7549410519963
"ar","ur",8.77940359789435
"ar","vi",7.71646080017636
"ar","zh",2.83321334405622
"ca","ar",2.56494935746154
"ca","bg",0
"ca","da",3.29583686600433
"ca","de",4.60517018598809
"ca","en",6.85224256905188
"ca","es",9.08704215563169
"ca","et",4.02535169073515
"ca","fi",2.70805020110221
"ca","fr",6.81563999007433
"ca","ht",4.57471097850338
"ca","hu",4.15888308335967
"ca","id",4.56434819146784
"ca","is",2.07944154167984
"ca","it",6.32076829425058
"ca","ja",2.484906649788
"ca","ko",0.693147180559945
"ca","lt",2.56494935746154
"ca","lv",4.06044301054642
"ca","nl",3.85014760171006
"ca","no",1.79175946922805
"ca","NONE",8.95273476710687
"ca","pl",3.25809653802148
"ca","pt",6.93925394604151
"ca","ru",2.30258509299405
"ca","sk",4.12713438504509
"ca","sl",4.04305126783455
"ca","sv",3.46573590279973
"ca","tl",4.53259949315326
"ca","tr",3.66356164612965
"ca","und",5.61677109766657
"ca","vi",3.97029191355212
"cs","ar",2.63905732961526
"cs","bg",3.49650756146648
"cs","da",0
"cs","de",4.15888308335967
"cs","en",7.45529848568329
"cs","es",5.08759633523238
"cs","et",3.85014760171006
"cs","fi",1.79175946922805
"cs","fr",3.66356164612965
"cs","ht",3.29583686600433
"cs","hu",3.3322045101752
"cs","id",3.40119738166216
"cs","is",0.693147180559945
"cs","it",3.40119738166216
"cs","ja",1.6094379124341
"cs","lt",2.484906649788
"cs","lv",3.25809653802148
"cs","nl",2.89037175789616
"cs","NONE",6.67203294546107
"cs","pl",4.34380542185368
"cs","pt",4.45434729625351
"cs","ru",5.90263333340137
"cs","sk",7.31121838441963
"cs","sl",4.4188406077966
"cs","sv",0.693147180559945
"cs","tl",3.25809653802148
"cs","tr",2.77258872223978
"cs","uk",0
"cs","und",5.2257466737132
"cs","vi",4.0943445622221
"cs","zh",1.09861228866811


# Build a lang1 x lang2 table of the log counts; the formula's column names
# are looked up in `data`. Language pairs absent from the data become 0.
Xmytable <- xtabs(ln ~ lang1 + lang2, data = mytable)
library(pheatmap)
# Draw the heatmap with the rows clustered.
pheatmap(Xmytable, cluster_rows = TRUE)

I'd like to add an option using ggplot(), which seems to require use of kmeans. However, I haven't been able to apply kmeans to this dataset due to the fact that I have non-numeric values, which is why the link shared above doesn't really answer the question for this situation (it's a useful link for heatmaps in general though).

OTHER TIPS

You can try this.

library(ggplot2)
# `mytable` is the data frame from the question (columns: count, lang1, lang2).
# The counts are already aggregated per language pair, so draw one tile per
# row and colour it by `count` rather than re-binning the rows.
ggplot(mytable, aes(x = lang1, y = lang2, fill = count)) + geom_tile()

To add a dendrogram, see the thread "Reproducing lattice dendrogram graph with ggplot2" and/or post another question.

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top