跳轉到內容

演算法實現/字串搜尋/Knuth-Morris-Pratt 模式匹配器

來自華夏公益教科書,開放的書籍,開放的世界
// COPY, COMPILE, AND RUN ON DEV-PASCAL
// DL LINK: http://download.cnet.com/Bloodshed-Dev-Pascal/3000-2069_4-10019919.html

// PROGRAM BY : AMMAR QADRI
// FROM       : WOBURN COLLEGIATE INSTITUTE

{$H+}
uses crt,Dos;
var
 h1,h2,m1,m2,se1,se2,ms1,ms2:word;
 b,comparisons:longint;
 t,matchfound:array[0..1000000] of longint;
 s,f:string;
  procedure START;
  begin
    GETTIME(h1,m1,se1,ms1);
  end;
  procedure FINISH;
  begin
    GETTIME(h2,m2,se2,ms2);
    writeln;
    writeln('**************************************************************');
    writeln('**************************************************************');
    writeln('************************ COMPARISONS : ',comparisons:4,' ******************');
    writeln('**************************************************************');
    writeln('************************ TEXT LENGTH : ',length(f):4,' ******************');
    writeln('**************************************************************');
    writeln('***************************** RUNTIME ************************');
    writeln('*************************** ',((se2-se1)+(m2-m1)*60+(h2-h1)*60*60+(ms2-ms1)/100):0:3,' SECONDS ********************');
    writeln('**************************************************************');
    writeln('************** IMPLEMENTATION # 001 - KMP AGLORITHM **********');
    writeln('**************************************************************');
    readln;
  end;
  procedure maketable;
  var
    x,y:longint;
  begin
    t[0]:=0;  {empty string}
    t[1]:=0;  {single letter}
    x:=1;  {match of characters found plus 1}
    y:=2;  {position in the pattern string starts off as 2}
           {since t[0] and t[1] are already set to 0 as default}
    while y <= length(s) do begin
          if s[x] = s[y] then begin   {found another match}
             t[y]:=x;
             inc(x);
             inc(y);
          end
          else begin
               if x <> 0 then begin    {did not find match so trying for a smaller one}
                  x:=t[x];
               end;
               if x = 0 then begin     {smallest match found (empty string)}
                  t[y]:=x;
                  inc(x);
                  inc(y);
               end;
          end;
    end;
  end;
 procedure printtable;
  var
    aaa:longint;
  begin
    writeln('TABLE : ');
    for aaa:= 0 to length(s) do begin
        writeln(aaa:4,' : ',t[aaa]:4);
    end;
    writeln;
  end;
  procedure findall;
  var
  x,y:longint;
  begin
     x:=0; {match so far (empty string)}
     y:=1; {position in text (looking for a place to begin the search)}
     while y+length(s)-1  <= length(f) do begin
           if s[x+1] = f[y+x] then begin
              while s[x+1] = f[y+x] do begin
                    {write(upcase(s[x+1]));   }
                    inc(comparisons);
                    inc(x);
                    if x = length(s) then begin
                       inc(b);
                       matchfound[b]:=y;
                       break;
                    end;
              end;
             { writeln;
              if x <> length(s) then
              writeln(y+x,' ',s[x+1]);   }
              y:=y+x-t[x];
              x:=t[x];
           end
}
                inc(comparisons);
                x:=0;
                inc(y);
           end;
     end;
  end;
  procedure printall;
  var
   aaa:longint;
  begin
    writeln('MATCHES : ');
    for aaa:= 1 to b do begin
        writeln(aaa:4,' : ',matchfound[aaa]:4);
    end;
    if b = 0 then writeln('NONE =P');
  end;
begin
  //read in the two strings...the text and the pattern
  write('TEXT   : ');
  readln(f);  {text}
  write('PATTERN: ');
  readln(s);  {pattern}
  writeln;
  START;{note start time of program}
  maketable;{makes the table}
  printtable;{prints the table}
  b:=0;  {initialize matchfound array}
  comparisons:=0;
  findall;{finds the matches}
  printall;{prints the matches}
  FINISH;{note finish time of program and print}
end.
# Knuth-Morris-Pratt string matching
# David Eppstein, UC Irvine, 1 Mar 2002

#from http://code.activestate.com/recipes/117214/
def KnuthMorrisPratt(text, pattern):

    '''Yields all starting positions of copies of the pattern in the text.
Calling conventions are similar to string.find, but its arguments can be
lists or iterators, not just strings, it returns all matches, not just
the first one, and it does not need the whole text in memory at once.
Whenever it yields, it will have read the text exactly up to and including
the match that caused the yield.'''

    # allow indexing into pattern and protect against change during yield
    pattern = list(pattern)

    # build table of shift amounts
    shifts = [1] * (len(pattern) + 1)
    shift = 1
    for pos in range(len(pattern)):
        while shift <= pos and pattern[pos] != pattern[pos-shift]:
            shift += shifts[pos-shift]
        shifts[pos+1] = shift

    # do the actual search
    startPos = 0
    matchLen = 0
    for c in text:
        while matchLen == len(pattern) or \
              matchLen >= 0 and pattern[matchLen] != c:
            startPos += shifts[matchLen]
            matchLen -= shifts[matchLen]
        matchLen += 1
        if matchLen == len(pattern):
            yield startPos

以下 Ada 實現包含演算法和測試程式,用於測試實現的正確性。

Ada 程式設計/演算法/Knuth-Morris-Pratt 模式匹配器

以下 C++ 實現僅包含演算法,沒有測試程式來測試實現的正確性。

#include <iostream>
#include <vector>
using namespace std;

//----------------------------
// Returns a vector containing the zero based index of
//  the start of each match of the string K in S.
//  Matches may overlap
//----------------------------
vector<int> KMP(string S, string K)
{
    vector<int> T(K.size() + 1, -1);
    vector<int> matches;

    if (K.size() == 0) {
        matches.push_back(0);
        return matches;
    }

    for (int i = 1; i <= K.size(); i++) {
        int pos = T[i - 1];
        while (pos != -1 && K[pos] != K[i - 1])
            pos = T[pos];
        T[i] = pos + 1;
    }

    int sp = 0;
    int kp = 0;
    while (sp < S.size()) {
        while (kp != -1 && (kp == K.size() || K[kp] != S[sp]))
            kp = T[kp];
        kp++;
        sp++;
        if (kp == K.size())
            matches.push_back(sp - K.size());
    }

    return matches;
}

來自維基百科頁面歷史記錄的 C 實現,經過少量修改。

https://en.wikipedia.org/w/index.php?title=Knuth%E2%80%93Morris%E2%80%93Pratt_algorithm&oldid=781303383

#include <stdio.h>
#include <string.h>
#include <stdlib.h>

void computeLPSArray(char *pat, int M, int *lps);

void KMPSearch(char *pat, char *txt) {
    int M = strlen(pat);
    int N = strlen(txt);

    //create lps[] that will hold the longest prefix suffix values for pattern
    int *lps = (int *)malloc(sizeof(int) * (M + 1));

    //Preprocess the pattern (calculate lps[] array)
    computeLPSArray(pat, M, lps);

    int i = 0; //index for txt[]
    int j = 0; //index for pat[]
    
    //at this point i may be incremented while i < N && txt[i] != pat[0] - for performance 
    while (i < N) {
        if (pat[j] == txt[i]) {
            i++;
            j++;
            if (j == M) {
                printf("Found pattern at index %d \n", i-j);
                j = lps[j];
            }
        }
        else {//if (pat[j] != txt[i]) //mismatch after j matches
        //Pattern shift
            j = lps[j];
            if (j < 0) {
                i++;
                j++;
            //at this point i may be incremented while i < N && txt[i] != pat[0] - for performance 
            }
        }
    }
    free(lps); //to avoid memory leak
}

void computeLPSArray(char *pat, int M, int *lps) {
    lps[0] = -1;
    int i = 1;
    int j = 0; 
    while (i < M) {
        if (pat[j] == pat[i]) {
            lps[i] = lps[j];
            i++;
            j++;
        }
        else {// (pat[j] != pat[i])
            lps[i] = j;
            j = lps[j];
            while (j >= 0 && pat[j] != pat[i]) {
                j = lps[j];
            }
            i++;
            j++;
        }
    }
    lps[i] = j;
}

// Driver program to test above function
int main() {
    char *txt = "apurba mandal loves ayoshi loves";
    char *pat = "loves";
    KMPSearch(pat, txt);
    return 0;
}
func computeLPS(pat string) (lps []int) {
    M := len(pat)
    lps = make([]int, M) // lps[0] = 0
 
    l := 0 // length of the previous longest prefix suffix
 
    // the loop calculates lps[i] for i = 1 to M-1
    for i := 1; i < M; i++ {
        for {
            if pat[i] == pat[l] {
                l++
                break
            }
 
            if l == 0 {
                break
            }
 
            l = lps[l-1]
        }
        lps[i] = l
    }
    return lps
}
 
func KMPSearch(pat, txt string) {
    M, N := len(pat), len(txt)
 
    // Preprocess the pattern that will hold the longest prefix suffix values for pattern
    lps := computeLPS(pat)
 
    for i, j := 0, 0; i < N; i++ {
        for {
            if pat[j] == txt[i] {
                j++
 
                if j == M {
                    fmt.Printf("Found pattern at index %d \n", i-j+1)
                    j = lps[j-1]
                }
                break
            }
 
            if j > 0 {
                j = lps[j-1]
            } else {
                break
            }
        }
    }
}

此實現需要 Lua 版本 5.1 或更高版本。

-- Implementation of the Knuth Morris Pratt algorithm to find
-- substrings within strings. 
-- Sean Brewer
-- Berry College CS 320
-- March 25, 2008

-- Generate the failure table using the substring and the length
-- of the substring
function generate_fail_table(substring,sublen)
	 comparisons = 0
	 fail = {0}
	 for i=2,sublen do
	     temp = fail[i - 1]
	     while temp > 0 and string.sub(substring,temp,temp) ~= string.sub(substring,i - 1,i - 1) do
		   comparisons = comparisons + 1
	     	   temp = fail[temp]
	     end
	 fail[i] = temp + 1
	 end

	 return {fail, comparisons + 1}
end

--The actual kmp algorithm which takes
--the substring and text as arguments
function kmp_algorithm(substring,text)
	 --starting positions...
	 subloc = 1
	 textloc = 1
	 
	 --get the length of substring and text..
	 sublen = string.len(substring)
	 textlen = string.len(text)
	 
	 --generate the fail links...
	 failure = generate_fail_table(substring,sublen)
	 
	 --store failure comparisons
	 comparisons = failure[2]
	 
	 --store fail table
	 fail = failure[1]
	 
	 --the main loop..
	 while textloc <= textlen and subloc <= sublen do
	       
	       if subloc == 0 or string.sub(text,textloc,textloc) == string.sub(substring,subloc,subloc) then
		  comparisons = comparisons + 1
	       	  textloc = textloc + 1
		  subloc = subloc + 1
	       else --no match found so follow fail link
	          subloc = fail[subloc]
	       end
	 end

	 if subloc > sublen then
	    return {textloc - sublen, comparisons}
	 else
	    return {0, comparisons}
	 end	

end
static int[] GetPrefix(string s)
{
    int[] result = new int[s.Length];
    result[0] = 0;
    int index = 0;

    for (int i = 1; i < s.Length; i++)
    {
        int k = result[i - 1];
        while (s[k] != s[i] && k > 0)
        {
            k = result[k - 1];
        }
        if (s[k] == s[i])
        {
            result[i] = k + 1;
        }
        else
        {
            result[i] = 0;
        }
    }
    return result;
}

static int FindSubstring(string pattern, string text)
{
    int[] pf = GetPrefix(pattern);
    int index = 0;

    for (int i = 0; i < text.Length; i++)
    {
        while (index > 0 && pattern[index] != text[i]) { index = pf[index - 1]; }
        if (pattern[index] == text[i]) index++;
        if (index == pattern.Length)
        {
            return i - index + 1;
        }
    }

    return -1;
}
華夏公益教科書