I am open to using arrays, collections or any other data type
As I see it, using a Collection instead of an array would be much simpler:
Sub BuildInvertedIndex(pDict As Dictionary, pRange As Range)
    ' Builds an inverted index of the words found in pRange.
    '
    ' For every space-separated word in every cell of pRange, pDict(word) is a
    ' Collection holding the worksheet row numbers of the cells containing it.
    ' Each row number is stored at most once per word.
    '
    ' pDict  - dictionary to fill; existing entries are kept and extended.
    ' pRange - cells to scan; each cell is treated as one line of text.
    Dim cell As Range
    Dim words As Variant, word As Variant
    Dim addError As Long

    For Each cell In pRange.Cells
        ' A cell holding an error value (#N/A, #DIV/0!, ...) cannot be passed
        ' to Split, so it is skipped instead of aborting the whole run.
        If Not IsError(cell.Value) Then
            words = Split(cell.Value)
            For Each word In words
                ' Consecutive/leading/trailing spaces make Split return empty
                ' tokens; an empty string is not a word, so do not index it.
                If Len(word) > 0 Then
                    If Not pDict.Exists(word) Then
                        pDict.Add word, New Collection
                    End If

                    ' The row number doubles as the collection key, so adding a
                    ' row that is already present raises error 457. Only that
                    ' expected error is ignored; anything else is re-raised.
                    On Error Resume Next
                    pDict.Item(word).Add Item:=cell.Row, Key:=CStr(cell.Row)
                    addError = Err.Number
                    On Error GoTo 0
                    If addError <> 0 And addError <> 457 Then
                        Err.Raise addError
                    End If
                End If
            Next word
        End If
    Next cell
End Sub
The next step is to slightly modify ArrayToString into ColToString:
Function ColToString(vCol As Collection, _
                     Optional vDelim As String = ",") As String
    ' Returns the items of vCol, each converted with CStr, joined by vDelim.
    ' An empty collection yields an empty string.
    ' Only included to support testing (lets you see what a collection holds).
    Dim itemCount As Long
    Dim parts() As String
    Dim entry As Variant
    Dim pos As Long

    itemCount = vCol.Count
    If itemCount = 0 Then
        ' Nothing to join; the function result is already "".
        Exit Function
    End If

    ' Convert every item once, then let Join insert the delimiters.
    ReDim parts(1 To itemCount)
    For Each entry In vCol
        pos = pos + 1
        parts(pos) = CStr(entry)
    Next entry

    ColToString = Join(parts, vDelim)
End Function
And the test subroutine (only one line changed, Debug.Print k & ": " & ColToString(vDict.Item(k)), and the target range set to "F2:F5"):
Sub Test()
    ' Minimal demonstration of BuildInvertedIndex: indexes F2:F5 of the active
    ' sheet and prints at most the first ten entries as "word: row,row,...".
    Const MAX_PREVIEW As Long = 10
    Dim sourceCells As Range
    Dim wordIndex As Dictionary
    Dim allWords As Variant
    Dim pos As Long, lastPos As Long

    Set sourceCells = ActiveSheet.Range("F2:F5")
    Set wordIndex = New Dictionary
    BuildInvertedIndex wordIndex, sourceCells

    ' Keys comes back as a zero-based array; cap the preview at MAX_PREVIEW.
    allWords = wordIndex.Keys
    lastPos = UBound(allWords)
    If lastPos > MAX_PREVIEW - 1 Then
        lastPos = MAX_PREVIEW - 1
    End If

    For pos = 0 To lastPos
        Debug.Print allWords(pos) & ": " & ColToString(wordIndex.Item(allWords(pos)))
    Next pos

    ' Release the dictionary.
    Set wordIndex = Nothing
End Sub
RESULT:
UPDATE:
To improve the speed of your code, you could store the range's values in an array (the following approach works only with a single-column range, but you could easily modify it):
Test sub:
Sub TestWirhArray()
    ' Minimal demonstration of BuildInvertedIndexWithArr: reads F2:F20585 of
    ' the active sheet into an array, indexes it, and prints at most the first
    ' ten entries as "word: row,row,...".
    Dim sourceCells As Range
    Dim cellValues As Variant
    Dim wordIndex As Dictionary
    Dim currentWord As Variant
    Dim remaining As Long

    Set wordIndex = New Dictionary
    Set sourceCells = ActiveSheet.Range("F2:F20585")

    ' Pull all values in one call; the first row of the range is passed along
    ' so the index can report worksheet row numbers.
    cellValues = sourceCells.Value
    BuildInvertedIndexWithArr wordIndex, cellValues, sourceCells.Row

    ' Count down from ten so only a preview is printed.
    remaining = 10
    For Each currentWord In wordIndex.Keys
        Debug.Print currentWord & ": " & ColToString(wordIndex.Item(currentWord))
        remaining = remaining - 1
        If remaining <= 0 Then Exit For
    Next currentWord

    ' Release the dictionary.
    Set wordIndex = Nothing
End Sub
The new version, BuildInvertedIndexWithArr:
Sub BuildInvertedIndexWithArr(pDict As Dictionary, pArr, firstRow As Long)
    ' Builds an inverted index from values already read into memory.
    '
    ' For every space-separated word in every element of pArr, pDict(word) is a
    ' Collection holding the row numbers of the elements containing it. Each
    ' row number is stored at most once per word.
    '
    ' pDict    - dictionary to fill; existing entries are kept and extended.
    ' pArr     - the lines to index: an array (e.g. Range.Value of a
    '            single-column range) or a single scalar value (what
    '            Range.Value returns for a one-cell range).
    ' firstRow - row number assigned to the first element of pArr.
    '
    ' The row counter advances by one per element, so row numbers are only
    ' meaningful when each element corresponds to exactly one row
    ' (single-column or one-dimensional input).
    Dim lineValue As Variant
    Dim rowNumber As Long

    rowNumber = firstRow

    If Not (IsArray(pArr) Or IsObject(pArr)) Then
        ' A scalar cannot be enumerated with For Each; treat it as one line.
        IndexWordsOfLine pDict, pArr, rowNumber
        Exit Sub
    End If

    For Each lineValue In pArr
        IndexWordsOfLine pDict, lineValue, rowNumber
        rowNumber = rowNumber + 1
    Next lineValue
End Sub

Private Sub IndexWordsOfLine(pDict As Dictionary, lineValue As Variant, ByVal rowNumber As Long)
    ' Adds rowNumber to the collection of every word contained in lineValue.
    Dim word As Variant
    Dim addError As Long

    ' Error values (#N/A, ...) and Null cannot be passed to Split; skip them
    ' instead of aborting the whole run.
    If IsError(lineValue) Then Exit Sub
    If IsNull(lineValue) Then Exit Sub

    For Each word In Split(lineValue)
        ' Consecutive/leading/trailing spaces make Split return empty tokens;
        ' an empty string is not a word, so do not index it.
        If Len(word) > 0 Then
            If Not pDict.Exists(word) Then
                pDict.Add word, New Collection
            End If

            ' The row number doubles as the collection key, so adding a row
            ' that is already present raises error 457. Only that expected
            ' error is ignored; anything else is re-raised.
            On Error Resume Next
            pDict.Item(word).Add Item:=rowNumber, Key:=CStr(rowNumber)
            addError = Err.Number
            On Error GoTo 0
            If addError <> 0 And addError <> 457 Then
                Err.Raise addError
            End If
        End If
    Next word
End Sub