Question

I am using EF6 as the Load layer of an ETL tool. I acknowledge there are better tools (such as SSIS, direct SQL queries, etc) however due to the complexity of the transformation, it needed to be done in code, and the target DB is built from an EF Model. The number of records inserted in a batch can exceed 100,000 records. This is not incredibly slow to do (using the DBContext.AddRange() method) but the memory usage is extremely high (exceeding 1GB)

For examples sake, i have the following data classes (which are created in memory)

    public class Foo
    {
        public long FooID { get; set; }
        public string SomeProperty { get; set; }
        public decimal AverageFlightSpeedOfUnladenSwallow { get; set; }
        public IEnumerable<Bar> Bars { get; set; }
    }

    public class Bar
    {
        public long BarID { get; set; }
        public Foo Foo { get; set; }
        public long FooID { get; set; }
        public string FavoriteColour { get; set; }
    }
dbContext.Foos.AddRange(ListOfFoos); //Pre constructed list of Foos
dbContext.Bars.AddRange(ListOfBars); //Pre constructed list of Bars (parent Foo items populated, FooID is not)
dbContext.SaveChanges();

I am looking at using the LINQ Entity Data reader to enable the conversion of IList<Foo> to a data reader so i can import it using SQLBulkCopy (SqlBulkCopy and Entity Framework, http://archive.msdn.microsoft.com/LinqEntityDataReader/Release/ProjectReleases.aspx?ReleaseId=389).

Requirement
The List<Bar> will not have the ID's of the parent Foo class. Entity framework handles this just fine, but i am not sure how to get this same functionality in SqlBulkCopy. Is there some way to get it done?

Was it helpful?

Solution 2

So,

Provided your EF user has the ability to alter schema on the database, you can pursue this method of solving the problem:

  1. Add a GUID column to the table
  2. Identify each object in memory by the GUID
  3. Insert the values via bulk Insert including the identifying GUID
  4. Select them back and map them to the inserted GUIDs
  5. Drop the GUID column

Here is some code to do just that. Its a little dirty and not optimised, but it minimised the original task down to 30MB of memory and 1 minute to process

public static class ForeignKeyBulkInsert
{
    private const string GUID_COLUMN_NAME = "GUID_SURROGATE_KEY";

    public static string GetTableName<T>(this ObjectContext context) where T : class
    {
        string sql = context.CreateObjectSet<T>().ToTraceString();
        Regex regex = new Regex("FROM (?<table>.*) AS");
        Match match = regex.Match(sql);

        string table = match.Groups["table"].Value;
        return table;
    }

    public static void AddRange<TEntity>(this DbContext db, IEnumerable<TEntity> range, bool importForeignKeyIDs = false)
        where TEntity : class
    {
        Dictionary<Guid, TEntity> lookup = new Dictionary<Guid, TEntity>();

        var objectContext = ((IObjectContextAdapter)db).ObjectContext;
        var os = objectContext.CreateObjectSet<TEntity>();
        bool hasAutoGeneratedKey = os.EntitySet.ElementType.KeyProperties.Any();
        Type entityType = typeof(TEntity);

        if (importForeignKeyIDs)
        {
            var foreignKeyProperties = os.EntitySet.ElementType.NavigationProperties.Where(x => x.ToEndMember.RelationshipMultiplicity == RelationshipMultiplicity.One);
            foreach (var foreignKeyProperty in foreignKeyProperties)
            {
                var foreignKeyIdProperty = foreignKeyProperty.GetDependentProperties().First();
                var parentKeyProperty = foreignKeyProperty.ToEndMember.GetEntityType().KeyMembers.First();
                PropertyInfo foreignKeyPropertyInfo = null;
                Type parentType = null;
                PropertyInfo parentKeyPropertyInfo = null;
                PropertyInfo foreignKeyIdPropertyInfo = null;
                foreach (var item in range)
                {
                    entityType.GetProperty(foreignKeyProperty.Name).GetValue(item);
                    if (foreignKeyPropertyInfo == null)
                        foreignKeyPropertyInfo = entityType.GetProperty(foreignKeyProperty.Name);
                    if (parentType == null)
                        parentType = foreignKeyPropertyInfo.GetValue(item).GetType();
                    if (parentKeyPropertyInfo == null)
                        parentKeyPropertyInfo = parentType.GetProperty(parentKeyProperty.Name);
                    if (foreignKeyIdPropertyInfo == null)
                        foreignKeyIdPropertyInfo = entityType.GetProperty(foreignKeyIdProperty.Name);

                    var foreignKey = foreignKeyPropertyInfo.GetValue(item);
                    if (foreignKey == null)
                        break;

                    var parentKey = parentKeyPropertyInfo.GetValue(foreignKey);
                    foreignKeyIdPropertyInfo.SetValue(item, parentKey);
                }
            }
        }

        string tableName = objectContext.GetTableName<TEntity>();
        var entityReader = range.AsDataReader(GUID_COLUMN_NAME, lookup);

        if (hasAutoGeneratedKey)
        {
            try
            {
                db.Database.ExecuteSqlCommand(string.Format("ALTER TABLE {0} ADD [{1}] uniqueidentifier null", tableName, GUID_COLUMN_NAME));
            }
            catch (Exception)
            {
                db.Database.ExecuteSqlCommand(string.Format("ALTER TABLE {0} DROP COLUMN [{1}]", tableName, GUID_COLUMN_NAME));
                db.Database.ExecuteSqlCommand(string.Format("ALTER TABLE {0} ADD [{1}] uniqueidentifier null", tableName, GUID_COLUMN_NAME));
            }
        }
        try
        {
            var connection = db.Database.Connection as SqlConnection;
            connection.Open();
            using (SqlBulkCopy cpy = new SqlBulkCopy(connection))
            {
                cpy.BulkCopyTimeout = 0;
                cpy.DestinationTableName = tableName;
                cpy.WriteToServer(entityReader);
                connection.Close();
            }

            if (hasAutoGeneratedKey)
            {
                db.Database.Connection.Open();
                var comm = db.Database.Connection.CreateCommand();
                comm.CommandText = string.Format("SELECT * FROM {0} WHERE [{1}] is not null", tableName, GUID_COLUMN_NAME);
                try
                {
                    using (var reader = comm.ExecuteReader())
                    {
                        while (reader.Read())
                        {
                            Guid surrogateKey = Guid.Parse(reader[GUID_COLUMN_NAME].ToString());
                            TEntity entity = lookup[surrogateKey];
                            var keyProperty = entityType.GetProperty(os.EntitySet.ElementType.KeyMembers.First().Name);
                            keyProperty.SetValue(entity, reader[keyProperty.Name]);
                        }
                    }
                }
                catch (Exception)
                {
                    throw;
                }
                finally
                {
                    //This should never occur
                    db.Database.Connection.Close();
                }
            }
        }
        catch (Exception)
        {
            throw;
        }
        finally
        {
            if (hasAutoGeneratedKey)
                db.Database.ExecuteSqlCommand(string.Format("ALTER TABLE {0} DROP COLUMN [{1}]", tableName, GUID_COLUMN_NAME));
        }

    }
}

OTHER TIPS

No, there is not direct way to do this with SQL bulkcopy.

SQL Bulkcopy is very close to the database, therefore it is very fast. The ORM handles the FK/PK relations but has a disadvantage of being slow.

Depending on your datamodel, you could do something like it is in this question: populate batches of datatables

SQL Bulkcopy YYYYMMDD problem

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top