You can get the corresponding SIP and DIP records on the same row with `merge()`:
# Sample flow records: each row's source IP (SIP) is the other row's
# destination IP (DIP).
df <- data.frame(
  UID = c(720107626538, 720108826800),
  SIP = c(1207697420, 3232248333),
  DIP = c(3232248333, 1207697420),
  PROTOCOL = c(17, 17),
  SPORT = c(53, 47904),
  DPORT = c(7722, 53),
  stringsAsFactors = FALSE
)

# Inner self-join. The left input keeps SIP (DIP dropped) and the right input
# keeps DIP (SIP dropped); rows are paired where the left SIP equals the right
# DIP. Columns present on both sides get the "_SIP" / "_DIP" suffix.
df_merged <- merge(
  x = df[, colnames(df) != "DIP"],
  y = df[, colnames(df) != "SIP"],
  by.x = "SIP",
  by.y = "DIP",
  all = FALSE,
  suffixes = c("_SIP", "_DIP")
)
After that, you can use the UID fields to remove duplicates:
# Remove mirrored matches from the self-join.
#
# Each conversation shows up twice in df_merged: once as
# (UID_SIP = a, UID_DIP = b) and once as (UID_SIP = b, UID_DIP = a).
# Ordering the two UIDs within every row gives both rows the same (lo, hi)
# key, so one vectorised duplicated() call keeps the first row of each pair.
#
# This replaces a row-by-row loop over 2:nrow(df_merged). That loop fixed its
# index range before it started removing rows, so later iterations indexed
# past the end of the shrunken data frame, and 2:nrow() counts backwards when
# the merge yields fewer than two rows. Because no loop is involved any more,
# the loop-related performance caveat mentioned after this snippet does not
# apply to this version.
uid_pair <- data.frame(
  lo = pmin(df_merged$UID_SIP, df_merged$UID_DIP),
  hi = pmax(df_merged$UID_SIP, df_merged$UID_DIP)
)
# drop = FALSE keeps the data-frame shape in every case.
df_merged <- df_merged[!duplicated(uid_pair), , drop = FALSE]
df_merged
df_merged
SIP UID_SIP PROTOCOL_SIP SPORT_SIP DPORT_SIP UID_DIP PROTOCOL_DIP SPORT_DIP DPORT_DIP
1 1207697420 720107626538 17 53 7722 720108826800 17 47904 53
Because the de-duping relies on a loop, the whole thing could get very time-consuming if your dataset is large.