In Oracle what is the fastest way to limit the characters in a string?

Question 1

You can try this approach, which looks to be much faster based on some (very) loose tests. It's a function that's compiled natively:

CREATE OR REPLACE function clean_string(
    in_string    in varchar2)
 return varchar2 AS
   -- Normalises a string: every character other than 0-9, A-Z and a-z is
   -- replaced by a space, consecutive spaces are collapsed into one, and the
   -- result is upper-cased and trimmed.
   --
   -- in_string : text to clean.
   -- returns   : the cleaned text; NULL when in_string is NULL or when
   --             nothing but spaces is left after cleaning.

   -- Sized to the PL/SQL varchar2 maximum. At most one character is appended
   -- per input character, so the result always fits.
   out_string varchar2(32767);
   in_length  pls_integer;
   -- Declared with character semantics: a byte-sized char(1) cannot hold a
   -- multi-byte character returned by substr and raises ORA-06502.
   in_char    varchar2(1 char);
   out_char   varchar2(1 char);
   -- Kept as number: ascii() of a multi-byte character returns its full code
   -- value, which may not fit in a pls_integer.
   dec_char   number;
   prev_space boolean := false;
begin
   --dbms_output.put_line('In string: ' || in_string);
   in_length := length(in_string);
   -- length() is NULL for a NULL input; nvl turns that into an empty range.
   for cnt in 1 .. nvl(in_length, 0)
   loop
     in_char  := substr(in_string, cnt, 1);
     dec_char := ascii(in_char);
     -- blank out non alphanumerics
     if    dec_char between 48 and 57      -- 0-9
        or dec_char between 65 and 90      -- A-Z
        or dec_char between 97 and 122     -- a-z
     then
       out_char := in_char;
     else
       out_char := ' ';
     end if;

     -- Append unless this would be the second space in a row.
     if not (prev_space and out_char = ' ') then
       out_string := out_string || out_char;
     end if;

     prev_space := (out_char = ' ');
   end loop;
   return trim(upper(out_string));
end;

-- Make NATIVE the compilation mode for this session, then recompile
-- clean_string so it is rebuilt under that setting.
ALTER SESSION SET plsql_code_type = NATIVE;
ALTER FUNCTION clean_string COMPILE;

And to test, I pulled 5 million rows from a table and cleaned some strings:

SET SERVEROUTPUT ON
DECLARE
    -- Benchmark: times three ways of cleaning six columns of my_table over the
    -- first 5,000,000 rows and prints one elapsed-time line per approach.

    -- Raw columns only; cleaning is done in the PL/SQL loop body.
    CURSOR c_raw IS
        SELECT name,
               address1,
               address2,
               city,
               state,
               postalcode
          FROM my_table
         WHERE ROWNUM <= 5000000;

    -- Same rows, but clean_string is invoked from inside the SQL statement.
    CURSOR c_cleaned_in_sql IS
        SELECT name,       clean_string(name)       AS cln_name,
               address1,   clean_string(address1)   AS cln_addr1,
               address2,   clean_string(address2)   AS cln_addr2,
               city,       clean_string(city)       AS cln_city,
               state,      clean_string(state)      AS cln_state,
               postalcode, clean_string(postalcode) AS cln_zip
          FROM my_table
         WHERE ROWNUM <= 5000000;

    l_rows      INTEGER := 0;
    l_name      VARCHAR2(100);
    l_addr1     VARCHAR2(100);
    l_addr2     VARCHAR2(100);
    l_city      VARCHAR2(100);
    l_state     VARCHAR2(100);
    l_zip       VARCHAR2(100);

    l_started   TIMESTAMP;
    l_finished  TIMESTAMP;
    l_elapsed   INTERVAL DAY TO SECOND(4);
BEGIN
    -- 1) clean_string called from PL/SQL, once per column per row.
    l_started := SYSTIMESTAMP;
    FOR r IN c_raw LOOP
        l_rows  := l_rows + 1;
        l_name  := clean_string(r.name);
        l_addr1 := clean_string(r.address1);
        l_addr2 := clean_string(r.address2);
        l_city  := clean_string(r.city);
        l_state := clean_string(r.state);
        l_zip   := clean_string(r.postalcode);
    END LOOP;
    l_finished := SYSTIMESTAMP;
    l_elapsed  := l_finished - l_started;
    DBMS_OUTPUT.PUT_LINE('Procedural approach timing: ' || l_elapsed);

    -- 2) clean_string called from the SELECT list; the loop only fetches.
    l_rows    := 0;
    l_started := SYSTIMESTAMP;
    FOR r IN c_cleaned_in_sql LOOP
        l_rows := l_rows + 1;
    END LOOP;
    l_finished := SYSTIMESTAMP;
    l_elapsed  := l_finished - l_started;
    DBMS_OUTPUT.PUT_LINE('SQL approach timing: ' || l_elapsed);

    -- 3) The nested REGEXP_REPLACE expression, evaluated in PL/SQL.
    l_rows    := 0;
    l_started := SYSTIMESTAMP;
    FOR r IN c_raw LOOP
        l_rows  := l_rows + 1;
        l_name  := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.name, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        l_addr1 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.address1, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        l_addr2 := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.address2, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        l_city  := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.city, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        l_state := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.state, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
        l_zip   := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(r.postalcode, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
    END LOOP;
    l_finished := SYSTIMESTAMP;
    l_elapsed  := l_finished - l_started;
    DBMS_OUTPUT.PUT_LINE('Existing approach timing: ' || l_elapsed);
END;

And the output was:

Procedural approach timing: +00 00:02:04.0320
SQL approach timing: +00 00:02:49.4326
Existing approach timing: +00 00:05:50.1607

Also, the native compilation seems to only help a procedural approach to the processing (rather than calling the function from a SQL query), but appears to be much faster than the regexp_replace solution. Hope that helps.

Question 2

First, let me say that I am not really answering my own question; I am accepting tbone's answer. The reason for posting this as an answer is that the comments don't let me include what I want to show.

I created a function almost identical to tbone's, with a couple of tweaks: I got rid of the UPPER call by changing how I handle the lower-case range of characters, and I changed the NUMBER variables to BINARY_INTEGER.

  FUNCTION CLEAN_STRING(IN_STRING in VARCHAR2) RETURN VARCHAR2
  AS
    -- Normalises a string: digits and upper-case letters are kept, lower-case
    -- a-z is converted to upper case, every other character becomes a space,
    -- and consecutive spaces are collapsed into one. The result is trimmed.
    --
    -- IN_STRING : text to clean.
    -- returns   : the cleaned text; NULL when IN_STRING is NULL or when
    --             nothing but spaces is left after cleaning.
    OUT_STRING VARCHAR2(32767);
    IN_LENGTH  BINARY_INTEGER;
    -- Declared with character semantics: a byte-sized CHAR(1) cannot hold a
    -- multi-byte character returned by SUBSTR and raises ORA-06502.
    IN_CHAR    VARCHAR2(1 CHAR);
    OUT_CHAR   VARCHAR2(1 CHAR);
    -- Kept as NUMBER: ASCII() of a multi-byte character returns its full code
    -- value, which may not fit in a BINARY_INTEGER.
    DEC_CHAR   NUMBER;
    PREV_SPACE BOOLEAN := FALSE;
  BEGIN
    IN_LENGTH := LENGTH(IN_STRING);
    -- LENGTH() is NULL for a NULL input; NVL turns that into an empty range.
    FOR CNT IN 1 .. NVL(IN_LENGTH, 0)
    LOOP
      IN_CHAR  := SUBSTR(IN_STRING, CNT, 1);
      DEC_CHAR := ASCII(IN_CHAR);
      -- blank out non alphanumerics
      IF (DEC_CHAR BETWEEN 48 AND 57) OR (DEC_CHAR BETWEEN 65 AND 90)
      THEN
        -- 0-9 or A-Z: keep it
        OUT_CHAR := IN_CHAR;
      ELSIF DEC_CHAR BETWEEN 97 AND 122
      THEN
        -- a-z: shift down by 32 to the matching upper-case letter
        OUT_CHAR := CHR(DEC_CHAR - 32);
      ELSE
        OUT_CHAR := ' ';
      END IF;

      -- Append unless this would be the second space in a row.
      IF (NOT(PREV_SPACE AND OUT_CHAR = ' '))
      THEN
        OUT_STRING := OUT_STRING || OUT_CHAR;
      END IF;

      PREV_SPACE := (OUT_CHAR = ' ');
    END LOOP;
    RETURN TRIM(OUT_STRING);
  END CLEAN_STRING;

I then created a simple test rig like tbone did, but I tested the three different routines against each other. First I verify that they all return the same results, and then I time each routine. Here is the test rig:

set serveroutput on
DECLARE
  -- Test rig for three cleaning routines (nested REGEXP_REPLACE,
  -- pathmast_utility_3.CLEAN_STRING, and TRANSLATE + REGEXP_REPLACE).
  -- Pass 1 counts rows where CLEAN_STRING or TRANSLATE disagrees with the
  -- regexp result; passes 2-4 time each routine on its own.
  CURSOR PATHMAST_CURS
  IS
    SELECT PATHMAST_TEXT_DIAGNOSIS FROM PATHMAST WHERE ROWNUM < 100000;
  DUMMY CLOB;
  DUMMY_1 CLOB;
  DUMMY_2 CLOB;
  l_interval interval day to second(4);
  l_start timestamp;
  l_end timestamp;
  diff_count_1 binary_integer := 0;
  diff_count_2 binary_integer := 0;

  -- NULL-safe inequality. A plain "a != b" evaluates to NULL when either side
  -- is NULL, so an IF would silently skip a row where one routine produced
  -- nothing and another produced text. NULL and zero-length values are
  -- treated as equal to each other.
  FUNCTION IS_DIFFERENT(P_LEFT IN CLOB, P_RIGHT IN CLOB) RETURN BOOLEAN
  IS
    l_left_len  NUMBER := NVL(LENGTH(P_LEFT), 0);
    l_right_len NUMBER := NVL(LENGTH(P_RIGHT), 0);
  BEGIN
    IF l_left_len = 0 OR l_right_len = 0
    THEN
      RETURN l_left_len != l_right_len;
    END IF;
    RETURN P_LEFT != P_RIGHT;
  END IS_DIFFERENT;
BEGIN

  -- Pass 1: verify the routines agree, using the regexp result as reference.
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '), '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
    DUMMY_1 := pathmast_utility_3.CLEAN_STRING(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '));
    DUMMY_2 := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()''_+-={[}]|/\":;,.<>?µ’±€'||chr(9),'ABCDEFGHIJKLMNOPQRSTUVWXYZ                                     ')),'( )* ',' ');
    IF IS_DIFFERENT(DUMMY_1, DUMMY)
    THEN
      diff_count_1 := diff_count_1 + 1;
    END IF;
    IF IS_DIFFERENT(DUMMY_2, DUMMY)
    THEN
      diff_count_2 := diff_count_2 + 1;
      dbms_output.put_line('Regexp: ' || DUMMY);
      dbms_output.put_line('Translate: ' || DUMMY_2);
    END IF;
  END LOOP;
  dbms_output.put_line('CLEAN_STRING differences: ' || diff_count_1);
  dbms_output.put_line('Translate differences: ' || diff_count_2);


  -- Pass 2: time the nested REGEXP_REPLACE expression.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := UPPER(TRIM(REGEXP_REPLACE(REGEXP_REPLACE(PATH_REC.PATHMAST_TEXT_DIAGNOSIS, '[^0-9A-Za-z ]', ' '),'( )* ',' ')));
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('Regexp approach timing: ' || l_interval);
  -------------------------------------------------    
  -- Pass 3: time the PL/SQL CLEAN_STRING function.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := pathmast_utility_3.CLEAN_STRING(PATH_REC.PATHMAST_TEXT_DIAGNOSIS);
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('CLEAN_STRING approach timing: ' || l_interval);
  -------------------------------------------------  
  -- Pass 4: time TRANSLATE followed by a single REGEXP_REPLACE.
  l_start := systimestamp;
  FOR PATH_REC IN PATHMAST_CURS
  LOOP
    DUMMY := regexp_replace(trim(translate(NVL(PATH_REC.PATHMAST_TEXT_DIAGNOSIS,' '),'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()''_+-={[}]|/\":;,.<>?µ’±€'||chr(9),'ABCDEFGHIJKLMNOPQRSTUVWXYZ                                     ')),'( )* ',' ');
  END LOOP;
  l_end := systimestamp;
  l_interval := l_end - l_start;
  dbms_output.put_line('TRANSLATE approach timing: ' || l_interval);
  -------------------------------------------------  
END;

And here are the results:

anonymous block completed
CLEAN_STRING differences: 0
Translate differences: 0
Regexp approach timing: +00 00:00:52.9160
CLEAN_STRING approach timing: +00 00:00:05.5220
TRANSLATE approach timing: +00 00:00:13.4320

This is all without compiling native. So tbone is the big winner. Thank you tbone.

If for whatever reason you want/need to use the translate version, you should build the translate string programmatically in order to get all of the special characters.

Question 3

Perhaps you can use TRANSLATE instead of a regex to remove the special characters and convert lower case to upper case.

-- Upper-case a-z and blank out the listed punctuation with TRANSLATE, trim
-- both ends, then squeeze every run of two or more spaces down to one.
REGEXP_REPLACE(
    TRIM(
        TRANSLATE(
            x,
            'abcdefghijklmnopqrstuvwxyz`~!@#$%^&*()_+-={[}]|/\"'':;,.<>?',
            'ABCDEFGHIJKLMNOPQRSTUVWXYZ                                '
        )
    ),
    ' {2,}',
    ' '
)

I tried it on a table with 1000 rows and a column of random characters, anywhere from 1 to 4000 characters long. It took around 35% less time. (I did not try it in PL/SQL.)