Method
The steps to do this involve:
- Transforming the data set
- Creating a song / song data frame
- Calculating the cosine similarity for each cell
Transformation
To transform the dataset we load the plyr library and take the subset of rows where country is Germany (I was only interested in Germany):
library(plyr)
data.germany<-(data[data$country %in% "Germany",])
We then want to create a frequency table and we only want songs that users listened to
# Count the rows for every user/artist combination, then keep only the
# combinations that actually occur (a count of at least one).
germany.frequency <- as.data.frame(
  table(
    data.germany$user,
    data.germany$artist,
    dnn = c("user", "artist")
  )
)
germany.frequency <- germany.frequency[germany.frequency$Freq >= 1, ]
We then populate a binary matrix where 1 represents an artist listened to by a user
# Distinct users and artists, kept as one-column matrices.
germany.users <- as.matrix(unique(data.germany$user))
germany.artists <- as.matrix(unique(data.germany$artist))

# User x artist matrix, initialised to 0 (not listened to).
holder <- matrix(
  0,
  nrow = nrow(germany.users),
  ncol = nrow(germany.artists),
  dimnames = list(as.character(germany.users), as.character(germany.artists))
)

# Set a 1 for every (user, artist) pair present in the frequency table.
# Indexing with a two-column character matrix addresses cells by
# (row name, column name), so all pairs are filled in a single assignment
# instead of scanning the frequency table once per cell in nested loops.
holder[cbind(
  as.character(germany.frequency$user),
  as.character(germany.frequency$artist)
)] <- 1

# Reorder the column names alphabetically
# (drop = FALSE keeps the result a matrix even with a single artist).
data.germany <- holder[, order(colnames(holder)), drop = FALSE]
We now have our holder matrix ready. Note: For loops take a long time in R.
Item Based Similarity
Drop the user column and make a new data frame
# Keep every column except "user" and store the result as a data frame.
# colnames() is used because names() returns NULL for a matrix, which would
# select no columns at all; drop = FALSE keeps a single remaining column
# two-dimensional so it still converts to a one-column data frame.
data.germany.ibs <- as.data.frame(
  data.germany[, !(colnames(data.germany) %in% c("user")), drop = FALSE]
)
Create a helper function to calculate the cosine between two vectors
# Cosine of the angle between two numeric vectors: their dot product divided
# by the product of their Euclidean lengths.
#   x, y: numeric vectors of equal length
#   returns: a single number (NaN when either vector has zero length)
getCosine <- function(x, y) {
  dot.product <- sum(x * y)
  length.x <- sqrt(sum(x * x))
  length.y <- sqrt(sum(y * y))
  dot.product / (length.x * length.y)
}
Create a placeholder dataframe listing item vs. item
# Square, all-NA table with one row and one column per column of
# data.germany.ibs, labelled with those column names on both axes.
holder <- array(
  NA,
  dim = rep(ncol(data.germany.ibs), 2),
  dimnames = rep(list(colnames(data.germany.ibs)), 2)
)
data.germany.ibs.similarity <- as.data.frame(holder)
Let's fill in those empty spaces with cosine similarities
# Cosine similarity between every pair of columns, computed in one step:
# crossprod() gives all pairwise dot products and the outer product of the
# column lengths gives all pairwise denominators. This is the same value the
# pairwise formula sum(x*y) / (sqrt(sum(x*x)) * sqrt(sum(y*y))) yields for
# each cell, without assigning into a data frame one cell at a time.
# as.matrix() makes this work whether data.germany.ibs is a data frame or a
# matrix.
item.matrix <- as.matrix(data.germany.ibs)
item.lengths <- sqrt(colSums(item.matrix * item.matrix))
data.germany.ibs.similarity <- as.data.frame(
  crossprod(item.matrix) / outer(item.lengths, item.lengths)
)
Output similarity results to a file
# Save the item-by-item similarity table as a CSV file.
write.csv(
  x = data.germany.ibs.similarity,
  file = "final-germany-similarity.csv"
)
Get the top 10 neighbours for each artist
# One row per item and 11 slots per row: the item itself followed by its
# 10 most similar items.
data.germany.neighbours <- matrix(
  NA,
  nrow = ncol(data.germany.ibs.similarity),
  ncol = 11,
  dimnames = list(colnames(data.germany.ibs.similarity))
)
for (i in seq_len(ncol(data.germany.ibs.similarity))) {
  scores <- data.germany.ibs.similarity[, i]
  # Rank by descending similarity, but force the item itself into first
  # place: another item with similarity 1 could otherwise take slot 1.
  is.self <- seq_along(scores) == i
  ranking <- order(is.self, scores, decreasing = TRUE)
  closest <- head(rownames(data.germany.ibs.similarity)[ranking], n = 11)
  # Fill only as many slots as there are items; with fewer than 11 items the
  # remaining slots stay NA instead of the assignment failing.
  data.germany.neighbours[i, seq_along(closest)] <- closest
}
Output neighbour results to a file
# Write the neighbour table without its first column (expected to hold each
# item itself). drop = FALSE keeps a one-row table as a matrix so its row
# name and row layout survive in the CSV.
write.csv(
  x = data.germany.neighbours[, -1, drop = FALSE],
  file = "final-germany-item-neighbours.csv"
)