merge multiple dbf files with non-matching headers in R

https://stackoverflow.com/questions/22331853

13-06-2023
|

Question

I have over 800 dbf files which I need to import and merge in R. I have been able to bring in all of the files using this code:

library(foreign)
setwd("c:/temp/help/")

# Collect every file in the working directory whose name ends in ".dbf".
files <- list.files(pattern = "\\.dbf$")

# Read each file into its own data frame. as.is = FALSE is passed to
# read.dbf() for every file.
all.the.data <- lapply(files, function(path) read.dbf(path, as.is = FALSE))

# Stack all data frames row-wise into one. This is the step that breaks
# down when the files do not share the same set of columns.
DATA <- do.call(rbind, all.the.data)

However, these dbf files have different numbers of columns, and even when two files have the same number of columns, their headers may differ. Here are four of the dbf files as an example:

# Example input 1: a one-row data frame labelled "1002_2km" with five
# VALUE_* columns.
file01 <- structure(
  list(
    PLOTBUFFER = structure(1L, .Label = "1002_2km", class = "factor"),
    VALUE_11 = 11443500,
    VALUE_31 = 13500,
    VALUE_42 = 928800,
    VALUE_43 = 162000,
    VALUE_90 = 18900
  ),
  .Names = c(
    "PLOTBUFFER", "VALUE_11", "VALUE_31", "VALUE_42", "VALUE_43", "VALUE_90"
  ),
  row.names = c(NA, -1L),
  class = "data.frame",
  # One code per column, in column order (presumably the dBase field types
  # recorded when the file was read -- confirm against read.dbf()).
  data_types = c("C", "F", "F", "F", "F", "F")
)
# Example input 2: a one-row data frame labelled "1002_5km" with eight
# VALUE_* columns (a superset of the columns in file01).
file02 <- structure(
  list(
    PLOTBUFFER = structure(1L, .Label = "1002_5km", class = "factor"),
    VALUE_11 = 66254400,
    VALUE_21 = 125100,
    VALUE_31 = 80100,
    VALUE_41 = 4234500,
    VALUE_42 = 3199500,
    VALUE_43 = 4194000,
    VALUE_52 = 376200,
    VALUE_90 = 72000
  ),
  .Names = c(
    "PLOTBUFFER", "VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41",
    "VALUE_42", "VALUE_43", "VALUE_52", "VALUE_90"
  ),
  row.names = c(NA, -1L),
  class = "data.frame",
  # One code per column, in column order (presumably the dBase field types
  # recorded when the file was read -- confirm against read.dbf()).
  data_types = c("C", "F", "F", "F", "F", "F", "F", "F", "F")
)
# Example input 3: a one-row data frame labelled "1003_2km" with eight
# VALUE_* columns, including VALUE_95, which file01 and file02 lack.
file03 <- structure(
  list(
    PLOTBUFFER = structure(1L, .Label = "1003_2km", class = "factor"),
    VALUE_11 = 1972800,
    VALUE_31 = 125100,
    VALUE_41 = 5316300,
    VALUE_42 = 990900,
    VALUE_43 = 1995300,
    VALUE_52 = 740700,
    VALUE_90 = 1396800,
    VALUE_95 = 25200
  ),
  .Names = c(
    "PLOTBUFFER", "VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42",
    "VALUE_43", "VALUE_52", "VALUE_90", "VALUE_95"
  ),
  row.names = c(NA, -1L),
  class = "data.frame",
  # One code per column, in column order (presumably the dBase field types
  # recorded when the file was read -- confirm against read.dbf()).
  data_types = c("C", "F", "F", "F", "F", "F", "F", "F", "F")
)
# Example input 4: a one-row data frame labelled "1003_5km" with the same
# column names as file03.
file04 <- structure(
  list(
    PLOTBUFFER = structure(1L, .Label = "1003_5km", class = "factor"),
    VALUE_11 = 43950600,
    VALUE_31 = 270000,
    VALUE_41 = 12969900,
    VALUE_42 = 5105700,
    VALUE_43 = 12614400,
    VALUE_52 = 1491300,
    VALUE_90 = 2055600,
    VALUE_95 = 70200
  ),
  .Names = c(
    "PLOTBUFFER", "VALUE_11", "VALUE_31", "VALUE_41", "VALUE_42",
    "VALUE_43", "VALUE_52", "VALUE_90", "VALUE_95"
  ),
  row.names = c(NA, -1L),
  class = "data.frame",
  # One code per column, in column order (presumably the dBase field types
  # recorded when the file was read -- confirm against read.dbf()).
  data_types = c("C", "F", "F", "F", "F", "F", "F", "F", "F")
)

I would like the merged data frame to look like this (shown here for file01 and file02 only):

# Desired result of combining file01 and file02: one row per input, the
# union of their columns, and 0 wherever an input lacked a column.
merged <- structure(
  list(
    PLOTBUFFER = structure(
      1:2,
      .Label = c("1002_2km", "1002_5km"),
      class = "factor"
    ),
    VALUE_11 = c(11443500, 66254400),
    VALUE_21 = c(0, 125100),
    VALUE_31 = c(13500, 80100),
    VALUE_41 = c(0, 4234500),
    VALUE_42 = c(928800, 3199500),
    VALUE_43 = c(162000, 4194000),
    VALUE_52 = c(0, 376200),
    VALUE_90 = c(18900, 72000)
  ),
  .Names = c(
    "PLOTBUFFER", "VALUE_11", "VALUE_21", "VALUE_31", "VALUE_41",
    "VALUE_42", "VALUE_43", "VALUE_52", "VALUE_90"
  ),
  class = "data.frame",
  row.names = c(NA, -2L)
)

That is, if a column is missing from one dataset, it should simply be filled in with a zero or a missing value (NA).

Thanks

-al

The suggestion by @infominer worked for the 4 files I included as an example, but when I tried to use merge_recurse on the large list of 802 elements, I received an error.

# Read every .dbf file in the working directory into a list of data frames.
files <- list.files(pattern = "\\.dbf$")
all.the.data <- lapply(files, function(path) read.dbf(path, as.is = FALSE))

# Merge the whole list in one call. With the full list of 802 data frames,
# this is the call that raised the error quoted below.
merged <- merge_recurse(all.the.data)

Error: evaluation nested too deeply: infinite recursion / options(expressions=)? Error during wrapup: evaluation nested too deeply: infinite recursion / options(expressions=)?

Solution

Use merge_recurse() from the reshape package:

library(reshape)

# Merge the four example data frames into a single data frame.
merged.files <- merge_recurse(
  list(file01, file02, file03, file04)
)

Edit:

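For the full list, try this code instead (thanks to Ramnath); it folds the list with Reduce() rather than recursing, so it should not hit the nesting limit: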

# Fold the list of data frames into one by merging them pairwise, left to
# right. all = TRUE keeps the rows of both inputs (a full outer join), so a
# column that one input lacks is filled with NA for that input's rows;
# replace the NAs afterwards if zeros are wanted.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
Reduce(function(x, y) merge(x, y, all = TRUE), all.the.data)

adapted from https://stackoverflow.com/a/6947326/2747709

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow