Question

Is there an easy way of determining the most frequently occuring word in a column/field using T-SQL or VBA?

I am working on a fuzzy matching system for two given recordsets and would like to produce a matching string where the most frequently occuring words are removed. As the data is from a customer relations management database terms like "limited", "ltd", "plc" and "CORPORATION" would be removed.

Was it helpful?

Solution

Written for sql-server 2005+

Function to split:

create function f_split
(
  @a varchar(max), 
  @delimiter varchar(20)
)
RETURNS @t TABLE(substr varchar(200))
as
begin
set @a = @a + @delimiter
;with a as
(
  select cast(1 as bigint) f1, charindex(@delimiter, @a) f2
  where len(@a) > 0
  union all
  select f2 + (len(@delimiter)) + 1, charindex(@delimiter, @a, f2+1)
  from a
  where f2 > 0
)
insert @t
select substring(@a, f1, f2 - f1) from a
where f1 < f2
return
end
go

Query:

--testdata
declare @table table(name varchar(50))

insert @table values('bla bla bla ltd')
insert @table values('bla plc ltd')
insert @table values('more text CORPORATION')


declare @matchlist table(name varchar(50), replacement varchar(50))
insert @matchlist values('ltd', 'limited')
insert @matchlist values('plc', 'limited')
insert @matchlist values('CORPORATION', 'limited')

--query
select coalesce(m.replacement, a.substr) name, count(*) count from @table p
cross apply
(
  select substr from 
  dbo.f_split(p.name, ' ')
) a
left join
@matchlist m
on a.substr = m.name
group by coalesce(m.replacement, a.substr)
order by 2 desc

Result:

name  count
----  -----
bla       4
limited   4
more      1
text      1

OTHER TIPS

Hope this will be useful to you.

   create table sometable
    ( id integer not null primary key identity
    , mYWords text not null
    );
    insert into sometable (mYWords) 
    values ('a word that appears maximum number of times in a column')
    insert into sometable (mYWords) 
    values ('Is it possible to get words from text columns in a sql server database')
    insert into sometable (mYWords) 
    values ('This could solve my problem if reffered column contain only single word')
    insert into sometable (mYWords) 
    values ('that''s going to require that you split out every word in the column individually')
    insert into sometable (mYWords) 
    values ('the query will definitely not be easy to write')
    insert into sometable (mYWords) 
    values ('Please read the sticky at the top of the board')
    insert into sometable (mYWords) 
    values ('The physical order of data in a database has no meaning')

GO

CREATE TABLE WordList (
      Word varchar(256)
    , WordId int IDENTITY(1,1)
    , Add_Dt datetime DEFAULT (GetDate()))
 GO

CREATE UNIQUE INDEX UnqueWords_PK ON WordList(Word)
GO

CREATE PROC isp_INS_WORD_LIST
AS
BEGIN
    SET NOCOUNT ON
    DECLARE @Words INT, @Pos INT, @x Int, @str varchar(256)
          , @word varchar(256), @start int, @end int, @exitstart int
    SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1

    DECLARE Cur1 CURSOR FOR SELECT mYWords FROM sometable
    OPEN Cur1
    FETCH NEXT FROM Cur1 INTO @str

    WHILE @@FETCH_STATUS = 0
      BEGIN
        WHILE (@x <> 0)
            BEGIN
                SET @x     = CHARINDEX(' ', @str, @Pos)
                IF @x <> 0
                  BEGIN 
                    SET @end   = @x - @start
                    SET @word  = SUBSTRING(@str,@start,@end)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @start, @end, @str
                    SET @exitstart = @start + @end + 1
                    SET @Pos   = @x + 1
                    SET @start = @x + 1
                    SET @Words = @Words + 1
                  END
                IF @x = 0
                  BEGIN
                    SET @word  = SUBSTRING(@str,@exitstart,LEN(@str)-@exitstart+1)
                    IF NOT EXISTS (SELECT * FROM WordList WHERE Word = @Word)
                        INSERT INTO WordList(Word) SELECT @word
                    -- SELECT @Word, @@ROWCOUNT,@@ERROR
                    -- SELECT @x, @Word, @exitstart, LEN(@str)-@exitstart, @str
                  END
            END
        FETCH NEXT FROM Cur1 INTO @str
        SELECT @Words = 0, @Pos = 1, @x = -1, @Word = '', @start = 1
      END   

      CLOSE Cur1
      DEALLOCATE Cur1
      SET NOCOUNT OFF
    RETURN @Words
END
GO

EXEC isp_INS_WORD_LIST
GO

SELECT * FROM WordList ORDER BY Word
GO

DROP PROC isp_INS_WORD_LIST
DROP TABLE WordList, sometable
GO
Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow
scroll top