Creating test load for self-referencing table

Question 1

I ended up using my original aproach with some tweaks:

disabling reference constraint before insert and re-enabling afterwards
using batch inserts as Mitch Wheat suggested

This is the schema:

DROP TABLE Test2
GO

CREATE TABLE Test2 (
    ID int IDENTITY(1,1) PRIMARY KEY CLUSTERED NOT NULL,
    ParentID int NULL /*REFERENCES Test2 (ID)*/
)
GO

ALTER TABLE Test2 
  ADD CONSTRAINT FK_SelfRef
    FOREIGN KEY(ParentID) REFERENCES Test2 (ID)
GO

And the script:

CHECKPOINT;
DBCC DROPCLEANBUFFERS;

SET NOCOUNT ON

ALTER TABLE Test2 NOCHECK CONSTRAINT FK_SelfRef

INSERT INTO Test2 (ParentID)
VALUES (NULL)

DECLARE @n int = 1

;WHILE(1=1)
BEGIN
  INSERT INTO Test2 (ParentID)
  SELECT ID FROM Test2 ORDER BY NEWID()

  SELECT @n = COUNT(*) FROM Test2

  IF(@n >= 999999)
    BREAK
END

ALTER TABLE dbo.Test2 WITH CHECK CHECK CONSTRAINT FK_SelfRef

SET NOCOUNT OFF

This executes in 10 secs, and I can't do it this fast with any other method.

NOTE: It inserts more records than needed. But the method can be arranged to insert exact no of records by limiting number of inserts in the last pass.

Question 2

You are not really measuring the INSERT performance with your posted code.

Picking a single random row using an ORDER BY clause like this:

SELECT TOP 1 * FROM table ORDER BY NEWID()

or even

SELECT TOP 1 * FROM table ORDER BY CHECKSUM(NEWID())

performs a table scan (because the random value associated with each row obviously needs to be calculated before the rows can be ordered), which can be slow for large tables. Using an indexed integer column (such as that commonly used for a primary key), and using:

SELECT TOP 1 * FROM table 
WHERE rowid >= RAND(CHECKSUM(NEWID())) * (SELECT MAX(rowid) FROM table)

works in constant time, provided the rowid column is indexed. Note: this assumes that rowid is uniformly distributed in the range 0..MAX(rowid). If your dataset has some other distribution, your results will be skewed (i.e. some rows will be picked more often than others).

Question 3

When parent is assigned randomly from the previously inserted rows, there is no control over the tree height (number of levels) and the way levels populated, which may not be desired in some scenarios.

It may be more convenient to populate tree with a data level by level.

Auxiliary table valued function is taken to generate numbers sequence using Itzik's cross joined CTE method (see e.g. here about it)

create function ftItziksCJCTE
(
    @cnt int
)
returns table as
return
(
    WITH
        E00(N) AS (SELECT 1 UNION ALL SELECT 1),
        E02(N) AS (SELECT 1 FROM E00 a, E00 b),
        E04(N) AS (SELECT 1 FROM E02 a, E02 b),
        E08(N) AS (SELECT 1 FROM E04 a, E04 b),
        E16(N) AS (SELECT 1 FROM E08 a, E08 b),
        E32(N) AS (SELECT 1 FROM E16 a, E16 b),
        E(N) AS (SELECT ROW_NUMBER() OVER (ORDER BY N) FROM E32)
    select N from E where N <= @cnt
)

Simple table to control elements distribution in the tree:

create table #TreeLevels
(
    LevelNo int identity(1, 1) not NULL,
    MinElements int not NULL,
    MaxElements int not NULL,
    primary key clustered (LevelNo)
)

Sample distribution:

insert into #TreeLevels values (7, 10)
insert into #TreeLevels values (70, 100)
insert into #TreeLevels values (700, 1000)

Will give us something like 7 to 10 elements with ParentID = NULL, each of them having something like 70 to 100 elements, etc. With total number of elements 343000 to 1000000

Or other distribution:

insert into #TreeLevels values (1, 1)
insert into #TreeLevels values (9, 15)
insert into #TreeLevels values (10, 12)
insert into #TreeLevels values (9, 15)
insert into #TreeLevels values (10, 12)
insert into #TreeLevels values (9, 15)
insert into #TreeLevels values (10, 12)

Meaning there will be single root element with something between 9 and 15 child elements, each of them having something like 10 to 12 elements, etc.

Then tree can be populated level by level:

declare @levelNo int, @eMin int, @eMax int

create table #Inserted (ID int not NULL, primary key nonclustered (ID))
create table #Inserted2 (ID int not NULL, primary key nonclustered (ID))

set @levelNo = 1
while 1=1
begin
    select @eMin = MinElements, @eMax = MaxElements from #TreeLevels where LevelNo = @levelNo

    if @@ROWCOUNT = 0
        break

    if @levelNo = 1
    begin
        insert into TestTree (ParentID)
        output inserted.ID into #Inserted (ID)
        select NULL from ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0))
    end
    else
    begin
        if exists (select 1 from #Inserted)
        begin
            insert into TestTree (ParentID)
            output inserted.ID into #Inserted2 (ID)
            select
                I.ID
            from
                #Inserted I
                cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F

            truncate table #Inserted
        end
        else
        begin
            insert into TestTree (ParentID)
            output inserted.ID into #Inserted (ID)
            select
                I.ID
            from
                #Inserted2 I
                cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F

            truncate table #Inserted2
        end
    end

    set @levelNo = @levelNo + 1
end

However, there is no control on the exact number of elements the tree will contain and leaf nodes are on the last level only. It would be good to have additional parameter controlling level population (percent of nodes on the same level which will have children).

create table #TreeLevels
(
    LevelNo int identity(1, 1) not NULL,
    MinElements int not NULL,
    MaxElements int not NULL,
    PopulatedPct float NULL,
    primary key clustered (LevelNo)
)

Sample distribution:

insert into #TreeLevels values (1, 1, NULL)
insert into #TreeLevels values (9, 15, NULL)
insert into #TreeLevels values (10, 12, NULL)
insert into #TreeLevels values (9, 15, 80)
insert into #TreeLevels values (10, 12, 65)
insert into #TreeLevels values (9, 15, 35)
insert into #TreeLevels values (10, 12, NULL)

NULL for a PopulatedPct percent is treated as 100%. PopulatedPct controls next level population and should be taken from previous level during cycle. Also it has no meaning for the last row in the #TreeLevels hence.

Now we can cycle trough levels taking PopulatedPct into account.

declare @levelNo int, @eMin int, @eMax int

create table #Inserted (ID int not NULL, primary key nonclustered (ID))
create table #Inserted2 (ID int not NULL, primary key nonclustered (ID))

set @levelNo = 1
while 1=1
begin
    select @eMin = MinElements, @eMax = MaxElements from #TreeLevels where LevelNo = @levelNo

    if @@ROWCOUNT = 0
        break

    if @levelNo = 1
    begin
        insert into TestTree (ParentID)
        output inserted.ID into #Inserted (ID)
        select NULL from ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0))
    end
    else
    begin
        declare @pct float
        select @pct = PopulatedPct from #TreeLevels where LevelNo = @levelNo - 1

        if exists (select 1 from #Inserted)
        begin
            if (@pct is NULL)
                insert into TestTree (ParentID)
                output inserted.ID into #Inserted2 (ID)
                select
                    I.ID
                from
                    #Inserted I
                    cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F
            else
                insert into TestTree (ParentID)
                output inserted.ID into #Inserted2 (ID)
                select
                    I.ID
                from
                    (select top (@pct) PERCENT ID from #Inserted order by rand(checksum(newid()))) I
                    cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F

            truncate table #Inserted
        end
        else
        begin
            if (@pct is NULL)
                insert into TestTree (ParentID)
                output inserted.ID into #Inserted (ID)
                select
                    I.ID
                from
                    #Inserted2 I
                    cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F
            else
                insert into TestTree (ParentID)
                output inserted.ID into #Inserted (ID)
                select
                    I.ID
                from
                    (select top (@pct) PERCENT ID from #Inserted2 order by rand(checksum(newid()))) I
                    cross apply ftItziksCJCTE(round(rand(checksum(newid())) * (@eMax - @eMin) + @eMin, 0)) F

            truncate table #Inserted2
        end
    end

    set @levelNo = @levelNo + 1
end

Still there is no control over the exact number of elements, but better control over the tree shape is gained.