I have a large data set in which every row for a given station should have the same latitude and longitude. In the data set some rows are missing the lat and lon and instead say 'unknown'. I need to fill in the unknowns with the lat and lon from other rows of the same station where that data is not missing.

In this example I would want row 5 to have 3 and 8 inserted for lat and lon:

> station <- c("a","b","c","c","c")
> lat <- c("1","2","3","3","unknown")
> lon <- c("6","7","8","8","unknown")
> data.frame(station,lat,lon)
  station     lat     lon
1       a       1       6
2       b       2       7
3       c       3       8
4       c       3       8
5       c unknown unknown

There are a million rows in my data set. If it takes a couple of minutes to complete, that is fine, since this only runs once before the analysis starts. I'd prefer not to install another package unless really necessary.

有帮助吗?

解决方案

Something like this, perhaps -

# Fill "unknown" lat/lon entries in `df` from another row of the same station.
# Each coordinate column is handled on its own, so a row that is missing only
# lat (or only lon) is repaired as well. Stations that have no known value at
# all are left as "unknown" instead of raising an error.
# Note: station, lat and lon are stored as character vectors afterwards.
df$station <- as.character(df$station)

for (coord in c("lat", "lon")) {
  values <- as.character(df[[coord]])
  known <- !is.na(values) & values != "unknown"
  unknown <- !is.na(values) & values == "unknown"

  # For every row: position (within the known rows) of the first known value
  # recorded for that row's station; NA when the station has none.
  donor <- match(df$station, df$station[known])

  # Only overwrite cells that are "unknown" and have a donor row available.
  fill <- unknown & !is.na(donor)
  values[fill] <- values[known][donor[fill]]
  df[[coord]] <- values
}

其他提示

I'd use na.locf from zoo package. First, I'd change unknown to NA and then apply na.locf:

# Load zoo, which provides na.locf ("last observation carried forward").
> library(zoo)
# Replace every cell equal to "unknown" with NA so it is treated as missing.
> df[ df=="unknown"] <- NA
# Split the rows by station, apply na.locf to each piece, then stack the
# pieces back together with rbind.
# NOTE(review): na.locf fills forward only, so an unknown row that comes
# first within its station has nothing to carry forward -- confirm how such
# leading NAs are handled (see the `na.rm` and `fromLast` arguments).
> df2 <- do.call(rbind, lapply(split(df, df$station), na.locf))
> df2[, -1]  <- sapply(df2[, -1], as.numeric)  # convert every column except the first to numeric
> df2
    station lat lon
a         a   1   6
b         b   2   7
c.3       c   3   8
c.4       c   3   8
c.5       c   3   8

If you want to change the row names, use rownames() and assign the new names:

> rownames(df2) <- 1:nrow(df2)
> df2
  station lat lon
1       a   1   6
2       b   2   7
3       c   3   8
4       c   3   8
5       c   3   8
#' Fill "unknown" coordinates from other rows of the same station
#'
#' Combines the three vectors into a matrix and replaces every "unknown"
#' latitude or longitude with the first known value recorded for the same
#' station. The two coordinate columns are repaired independently, so rows
#' missing only one of them (or several stations with gaps) are handled.
#' Cells whose station has no known value are left as "unknown".
#'
#' @param station Vector of station identifiers.
#' @param lat Vector of latitudes, with "unknown" marking a missing value.
#' @param lon Vector of longitudes, with "unknown" marking a missing value.
#' @return The matrix `cbind(station, lat, lon)` with the "unknown" cells
#'   filled in where a donor row exists.
y <- function(station, lat, lon) {
  temp <- cbind(station, lat, lon)

  for (coord in c("lat", "lon")) {
    values <- temp[, coord]
    known <- !is.na(values) & values != "unknown"
    unknown <- !is.na(values) & values == "unknown"

    # For every row: position (within the known rows) of the first known
    # value for that row's station; NA when the station has none.
    donor <- match(temp[, "station"], temp[known, "station"])

    # Only touch cells that are "unknown" and have a donor available.
    fill <- unknown & !is.na(donor)
    temp[fill, coord] <- values[known][donor[fill]]
  }

  temp
}




## Case 1: the last row of station "c" is missing both lat and lon.

station <- c("a", "b", "c", "c", "c")
lat <- c("1", "2", "3", "3", "unknown")
lon <- c("6", "7", "8", "8", "unknown")

y(station, lat, lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"


## Case 2: only lon is missing on the last row.

station <- c("a", "b", "c", "c", "c")
lat <- c("1", "2", "3", "3", "3")
lon <- c("6", "7", "8", "8", "unknown")
y(station, lat, lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"


## Case 3: only lat is missing on the last row.

station <- c("a", "b", "c", "c", "c")
lat <- c("1", "2", "3", "3", "unknown")
lon <- c("6", "7", "8", "8", "8")
y(station, lat, lon)
# station lat lon
# [1,] "a"     "1" "6"
# [2,] "b"     "2" "7"
# [3,] "c"     "3" "8"
# [4,] "c"     "3" "8"
# [5,] "c"     "3" "8"
许可以下: CC-BY-SA归因
不隶属于 StackOverflow
scroll top