If the patterns are short (e.g. 3 or 4 letters) and the strings are very long, you might as well count every possible pattern in each string and then keep the most frequent one(s) per string:
/* Sample input: one sequence string per data line, numbered in read order. */
data have;
    infile datalines truncover;   /* truncover: accept lines shorter than the informat width */
    input DNA_STR $85.;
    str_id = _n_;                 /* one record is read per step iteration, so this is 1, 2, ... */
    /* The data lines below must start in column 1: leading blanks would shift the 85-column read. */
    datalines;
CGGAGGACXYZTCTAGGTAXYZACGCTTATCAGXYZGTCCATAGGACATXYZTCG123CTCTAGGXYZGAATCAGGTGCT12TC
CGGA456CABCTCTAGGTAABCACGCTTATCAG123GTCCATAGGACATXYZTCGGAACTCTAGGABCGAATCAG987CTTATC
;
/*
 * For every string in HAVE, count how often each of the 64 three-letter
 * patterns over A/T/G/C occurs and write one row per (str_id, pat) whose
 * count is greater than 1.
 *
 * Output columns: str_id, count, pat.
 * Occurrences are counted left to right without overlap: after a hit the
 * search resumes just past the matched pattern.
 */
data want;
    /* Lookup table of all 4*4*4 = 64 patterns; _temporary_ array elements
       keep their values across step iterations and are not written out. */
    array patern {64} $3 _temporary_;

    /* Fill the table once, on the first iteration of the step. */
    if _n_ = 1 then do;
        do c1 = "A", "T", "G", "C";
            do c2 = "A", "T", "G", "C";
                do c3 = "A", "T", "G", "C";
                    n_pat + 1;   /* sum statement: starts at 0 and is retained */
                    patern{n_pat} = cats(c1, c2, c3);
                end;
            end;
        end;
    end;

    set have;

    /* Upper bound on scan iterations for this row. Every hit consumes at
       least one character, so the string length can never be too small;
       this replaces a fixed 9999 cap that could undercount long strings. */
    str_len = length(DNA_STR);

    do p = 1 to dim(patern);
        /* Keep this assignment order: it fixes the column order of WANT. */
        count = 0;
        pos = 1;
        pat = patern{p};
        pat_len = length(pat);

        /* FIND returns 0 when there is no further match, which ends the scan. */
        do k = 1 to str_len until (pos = 0);
            pos = find(DNA_STR, pat, pos);
            if pos > 0 then do;
                count = count + 1;
                pos = pos + pat_len;   /* resume after the match (no overlap) */
            end;
        end;

        /* Only patterns that repeat within the string are reported. */
        if count > 1 then output;
    end;

    keep str_id pat count;
run;
/* Per string, print the pattern(s) whose count equals that string's maximum.
   The query selects detail columns while filtering on a group summary, so
   PROC SQL remerges max(count) back onto the rows of each str_id group;
   patterns tied for the maximum are all listed. */
proc sql;
    select str_id,
           count,
           pat
    from want
    group by str_id
    having count = max(count);
quit;
... View more