Here is one way with GNU awk (it relies on the gawk-only `asorti` function to sort the row and column labels):
awk '
# Build a presence/absence matrix from two-column input:
#   $1 = row label (bacteria), $2 = column label (protein).
# Requires GNU awk: asorti() is a gawk extension.
NF {
    # The NF pattern skips blank lines; without it an empty line would
    # register "" as both a row label and a column label.
    header[$2]++        # distinct column labels
    bacteria[$1]++      # distinct row labels
    map[$1,$2]++        # (row, column) pairs that occur in the input
}
END {
    # asorti() writes the sorted indices of its first argument into the
    # second argument and returns how many there are.
    x = asorti(header, header_s)
    for (i = 1; i <= x; i++) {
        printf "\t%s\t", header_s[i]
    }
    print ""

    y = asorti(bacteria, bacteria_s)
    for (j = 1; j <= y; j++) {
        printf "%s\t\t", bacteria_s[j]
        for (z = 1; z <= x; z++) {
            # "in" tests membership without creating the element as a
            # side effect, unlike a plain map[...] reference.
            printf "%s\t\t\t\t", ((bacteria_s[j], header_s[z]) in map) ? "1" : "0"
        }
        print ""
    }
}' file
protein:plasmid:147856 protein:plasmid:149679 protein:proph:183386
bacteria_1 0 1 1
bacteria_2 0 0 1
bacteria_3 1 0 1
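Here is a solution with regular awk (no `asorti`, so rows and columns appear in the order they are first seen in the input rather than sorted):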
awk '
# Build a presence/absence matrix from two-column input using only
# portable awk features:
#   $1 = row label (bacteria), $2 = column label (protein).
# Rows and columns are emitted in the order the labels first appear.

# Skip blank lines so they do not add an empty row and column.
NF == 0 { next }

# Record each label the first time it is seen. Rows and columns use
# separate "seen" arrays: with one shared array, a label occurring in both
# columns would be dropped from whichever list encountered it second.
!seen_bacteria[$1]++ { bacteria[++x] = $1 }
!seen_protein[$2]++  { protein[++y] = $2 }

# Remember every (row, column) pair that occurs in the input.
{ map[$1,$2]++ }

END {
    # Header line: one cell per column label.
    for (i = 1; i <= y; i++) {
        printf "\t%s\t", protein[i]
    }
    print ""

    # One line per row label: 1 if the pair was seen, 0 otherwise.
    for (j = 1; j <= x; j++) {
        printf "%s\t\t", bacteria[j]
        for (a = 1; a <= y; a++) {
            # "in" tests membership without creating the element as a
            # side effect, unlike a plain map[...] reference.
            printf "%s\t\t\t\t", ((bacteria[j], protein[a]) in map) ? "1" : "0"
        }
        print ""
    }
}' file