'''
Implementation of the approximate string matching algorithm.
Copyright
'''
class PatternSearch:
'''
Class that solves the pattern search problem
Unit-Tests:
We do not expect students to test an empty text or pattern,
since we consider the case where both pattern and text consist
of at least one symbol the smallest reasonable case (We test
empty text and pattern anyway). However, some unit tests with
small inputs should be present. No need to test a typical
input besides the one given in the exercise.
>>> ps_test = PatternSearch()
>>> ps_test.text = ""
>>> ps_test.calculate_distance("")
0
>>> ps_test.text = "a"
>>> ps_test.calculate_distance("")
0
>>> ps_test.calculate_distance("a")
0
>>> ps_test.calculate_distance("b")
1
>>> ps_test.get_match(ps_test.calculate_distance("a"))
'a\\n'
>>> ps_test.get_match(ps_test.calculate_distance("b"))
'\\n'
'''
def __init__(self):
'''
Default constructor.
'''
self.pattern = "" # current search pattern
self.pattern_list = list() # list of search patterns
self.text = "" # search text
self.table = [[]] # stores E(i,b)
def read_text(self, file_name):
'''
Reads file with text to search for patterns.
'''
with open(file_name, "r") as text_file:
self.text = text_file.read().replace('\n', ' ')
def read_pattern(self, file_name):
'''
Reads file with patterns to search for
Param:
String file_name - file with patterns.
'''
pattern_file = open(file_name, "r")
for line in pattern_file:
current_pattern = line.strip('\n')
self.pattern_list.append(current_pattern)
def calculate_distance(self, pattern):
'''
Computes the editing distances E(i,b) by filling a table (bottom-up)
using the concept of dynamic programming
Param:
String pattern - pattern to look for in text
Returns:
edit distance between pattern and closest substring of text
'''
self.pattern = pattern
# initialize table
self.table = [
[0 for x in range(len(self.text)+1)]
for x in range(len(self.pattern)+1)
]
# insert base cases into table
for i in range(0, len(self.pattern)+1):
self.table[i][0] = i
# fill table bottom-up
for i in range(1, len(self.pattern)+1):
for b in range(1, len(self.text)+1):
c = 1
if (self.pattern[i-1] == self.text[b-1]):
# we have to subtract 1 from indices since pattern & text
# are 0-based 'arrays' and not 1-based as in the lecture.
c = 0
# recursive formula from the lecture
self.table[i][b] = min(self.table[i-1][b] + 1,
self.table[i][b-1] + 1,
self.table[i-1][b-1] + c)
# search for the smallest edit distance in the table
min_dist = max(len(self.text), len(self.pattern))
for i in range(1, len(self.text)+1):
if self.table[len(self.pattern)][i] < min_dist:
min_dist = self.table[len(self.pattern)][i]
return min_dist
def get_match(self, min_dist):
'''
Method that performs backtracking on the distance table to
determine the part of the text that gives a closest match
to the search pattern.
Param:
min_dist - value of edit distance
Returns:
int index of first closest to pattern substring in text.
'''
b = 0
i = len(self.pattern)
# search end_index b that minimizes self.table[len(self.pattern)][b]
while min_dist != self.table[i][b]:
b += 1
end_index = b
# search start_index b' by backtracking a path from
# self.table[len(self.pattern)][b] to self.table[0][b']
while i > 0 and b > 0:
if self.table[i][b] == self.table[i][b-1] + 1:
b -= 1
elif self.table[i][b] == self.table[i-1][b] + 1:
i -= 1
else:
c = 1
if self.pattern[i-1] == self.text[b-1]:
c = 0
if self.table[i][b] == self.table[i-1][b-1] + c:
b -= 1
i -= 1
start_index = b
return self.text[start_index: end_index] + '\n'
def find_all_patterns(self, file_name):
'''
Method that computes minimal Edit Distance and index
of first substring of all patterns and saves the result
Param:
file_name - name of file where text substring
with smallest edit distance to all patterns
are going to be saved.
'''
out = open(file_name, "w")
for p in self.pattern_list:
dist = self.calculate_distance(p)
out.write(self.get_match(dist))
out.close()
@classmethod
def solve(cls):
'''
Class method that solves the task.
'''
ps = cls()
ps.read_text("text.txt")
ps.read_pattern("patterns.txt")
ps.find_all_patterns("result.txt")
PatternSearch.solve()