Skip to content

Commit 8349b87

Browse files
authored
Merge pull request #67 from Boris-2021/master
Add a matching pattern
2 parents 758a94c + cb7944d commit 8349b87

8 files changed

+674
-137
lines changed

drain3/drain.py

+139-132
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def __init__(self):
4949
self.cluster_ids: List[int] = []
5050

5151

52-
class Drain:
52+
class DrainBase:
5353
def __init__(self,
5454
depth=4,
5555
sim_th=0.4,
@@ -103,6 +103,144 @@ def clusters(self):
103103
def has_numbers(s):
    """Return True if at least one character of ``s`` is a digit, else False."""
    for ch in s:
        if ch.isdigit():
            return True
    return False
105105

106+
def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
    """
    Pick, among the given clusters, the one most similar to a tokenized log message.

    :param cluster_ids: IDs of the candidate clusters to compare against.
    :param tokens: the log message, split into tokens.
    :param sim_th: minimum similarity the best candidate must reach
        (None is returned if no cluster reaches it).
    :param include_params: passed through to get_seq_distance; consider tokens matched
        to wildcard parameters in the similarity score.
    :return: the best-matching cluster, or None.
    """
    best_cluster = None
    best_sim = -1
    best_param_count = -1

    for cid in cluster_ids:
        # Look the candidate up with .get() so that merely testing it for a match
        # bypasses the cache's eviction algorithm; IDs that are no longer present
        # are skipped.
        candidate = self.id_to_cluster.get(cid)
        if candidate is None:
            continue

        sim, param_count = self.get_seq_distance(candidate.log_template_tokens, tokens, include_params)

        # Higher similarity wins; on an exact tie the candidate with more
        # parameters is preferred.
        if sim > best_sim or (sim == best_sim and param_count > best_param_count):
            best_sim, best_param_count, best_cluster = sim, param_count, candidate

    return best_cluster if best_sim >= sim_th else None
137+
138+
def print_tree(self, file=None, max_clusters=5):
    """
    Print the whole prefix tree, starting at the root node.

    :param file: forwarded to print_node, which passes it to print(); None keeps print()'s default.
    :param max_clusters: forwarded to print_node; caps how many clusters are listed per node.
    """
    root_label = "root"
    root_depth = 0
    self.print_node(root_label, self.root_node, root_depth, file, max_clusters)
140+
141+
def print_node(self, token, node, depth, file, max_clusters):
    """
    Print one tree node, then its children (recursively), then its clusters.

    :param token: key under which this node is stored in its parent ("root" for the root).
    :param node: the node to print; its key_to_child_node and cluster_ids are read.
    :param depth: nesting level, used as the number of leading tabs.
    :param file: passed to print() as its ``file`` argument.
    :param max_clusters: at most this many of the node's clusters are printed.
    """
    # The label format depends on the level: the root and non-numeric first-level
    # keys are shown as <token>, numeric first-level keys as <L=token>, and
    # anything deeper as a quoted token.
    if depth == 0:
        label = f'<{token}>'
    elif depth == 1:
        label = f'<L={token}>' if token.isdigit() else f'<{token}>'
    else:
        label = f'"{token}"'

    cluster_total = len(node.cluster_ids)
    suffix = f" (cluster_count={cluster_total})" if cluster_total > 0 else ""

    print('\t' * depth + label + suffix, file=file)

    next_depth = depth + 1
    for child_token, child_node in node.key_to_child_node.items():
        self.print_node(child_token, child_node, next_depth, file, max_clusters)

    # Clusters attached to this node are listed one level deeper than the node itself.
    cluster_indent = '\t' * next_depth
    for cid in node.cluster_ids[:max_clusters]:
        print(cluster_indent + str(self.id_to_cluster[cid]), file=file)
166+
167+
def get_content_as_tokens(self, content):
    """
    Split a raw log message into a list of tokens.

    Surrounding whitespace is stripped, every string in self.extra_delimiters is
    replaced by a single space, and the result is split on whitespace.

    :param content: the raw log message text.
    :return: list of token strings (empty if nothing but delimiters/whitespace remains).
    """
    normalized = content.strip()
    for delim in self.extra_delimiters:
        normalized = normalized.replace(delim, " ")
    return normalized.split()
173+
174+
def add_log_message(self, content: str):
    """
    Feed one log message into the model, creating or updating a cluster.

    The message is tokenized and searched for in the tree. If no cluster matches,
    a new one is created, registered in id_to_cluster and added to the prefix
    tree. Otherwise the matching cluster's template is merged with the message
    and its size is incremented.

    :param content: the raw log message text.
    :return: tuple ``(cluster, update_type)`` where update_type is one of
        "cluster_created", "cluster_template_changed" or "none".
    """
    tokens = self.get_content_as_tokens(content)

    if self.profiler:
        self.profiler.start_section("tree_search")
    cluster = self.tree_search(self.root_node, tokens, self.sim_th, False)
    if self.profiler:
        self.profiler.end_section()

    if cluster is not None:
        # The message belongs to an existing cluster: merge it into the template.
        if self.profiler:
            self.profiler.start_section("cluster_exist")
        merged_template = tuple(self.create_template(tokens, cluster.log_template_tokens))
        if merged_template == cluster.log_template_tokens:
            update_type = "none"
        else:
            cluster.log_template_tokens = merged_template
            update_type = "cluster_template_changed"
        cluster.size += 1
        # Touch cluster to update its state in the cache.
        # noinspection PyStatementEffect
        self.id_to_cluster[cluster.cluster_id]
    else:
        # Nothing matched: start a new cluster whose template is the message itself.
        if self.profiler:
            self.profiler.start_section("create_cluster")
        self.clusters_counter += 1
        new_id = self.clusters_counter
        cluster = LogCluster(tokens, new_id)
        self.id_to_cluster[new_id] = cluster
        self.add_seq_to_prefix_tree(self.root_node, cluster)
        update_type = "cluster_created"

    if self.profiler:
        self.profiler.end_section()

    return cluster, update_type
213+
214+
def get_total_cluster_size(self):
    """Return the sum of the ``size`` attribute over every cluster in id_to_cluster."""
    return sum(cluster.size for cluster in self.id_to_cluster.values())
219+
220+
def get_clusters_ids_for_seq_len(self, seq_fir):
    """
    Collect the IDs of all clusters stored under one first-level branch of the tree.

    :param seq_fir: int/str key of the first-level node; it is looked up as
        ``str(seq_fir)`` among the root node's children.
        NOTE(review): presumably the token count or the first token of the
        sequence, depending on how the concrete tree groups its first level -
        confirm against the subclass in use.
    :return: list of cluster IDs found in that node and all of its descendants
        (parent before children, children in stored order); an empty list when
        the root has no child under that key.
    """
    start_node = self.root_node.key_to_child_node.get(str(seq_fir))
    if start_node is None:
        # no first-level node is stored under this key
        return []

    collected = []
    pending = [start_node]
    while pending:
        node = pending.pop()
        collected.extend(node.cluster_ids)
        # Push children in reverse so they are popped, and hence visited, in their
        # stored order - the same order a recursive walk would produce.
        pending.extend(reversed(list(node.key_to_child_node.values())))
    return collected
240+
241+
242+
class Drain(DrainBase):
243+
106244
def tree_search(self, root_node: Node, tokens: list, sim_th: float, include_params: bool):
107245

108246
# at first level, children are grouped by token (word) count
@@ -232,38 +370,6 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
232370

233371
return ret_val, param_count
234372

235-
def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
236-
"""
237-
Find the best match for a log message (represented as tokens) versus a list of clusters
238-
:param cluster_ids: List of clusters to match against (represented by their IDs)
239-
:param tokens: the log message, separated to tokens.
240-
:param sim_th: minimum required similarity threshold (None will be returned in no clusters reached it)
241-
:param include_params: consider tokens matched to wildcard parameters in similarity threshold.
242-
:return: Best match cluster or None
243-
"""
244-
match_cluster = None
245-
246-
max_sim = -1
247-
max_param_count = -1
248-
max_cluster = None
249-
250-
for cluster_id in cluster_ids:
251-
# Try to retrieve cluster from cache with bypassing eviction
252-
# algorithm as we are only testing candidates for a match.
253-
cluster = self.id_to_cluster.get(cluster_id)
254-
if cluster is None:
255-
continue
256-
cur_sim, param_count = self.get_seq_distance(cluster.log_template_tokens, tokens, include_params)
257-
if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count):
258-
max_sim = cur_sim
259-
max_param_count = param_count
260-
max_cluster = cluster
261-
262-
if max_sim >= sim_th:
263-
match_cluster = max_cluster
264-
265-
return match_cluster
266-
267373
def create_template(self, seq1, seq2):
268374
assert len(seq1) == len(seq2)
269375
ret_val = list(seq2)
@@ -274,99 +380,6 @@ def create_template(self, seq1, seq2):
274380

275381
return ret_val
276382

277-
def print_tree(self, file=None, max_clusters=5):
278-
self.print_node("root", self.root_node, 0, file, max_clusters)
279-
280-
def print_node(self, token, node, depth, file, max_clusters):
281-
out_str = '\t' * depth
282-
283-
if depth == 0:
284-
out_str += f'<{token}>'
285-
elif depth == 1:
286-
out_str += f'<L={token}>'
287-
else:
288-
out_str += f'"{token}"'
289-
290-
if len(node.cluster_ids) > 0:
291-
out_str += f" (cluster_count={len(node.cluster_ids)})"
292-
293-
print(out_str, file=file)
294-
295-
for token, child in node.key_to_child_node.items():
296-
self.print_node(token, child, depth + 1, file, max_clusters)
297-
298-
for cid in node.cluster_ids[:max_clusters]:
299-
cluster = self.id_to_cluster[cid]
300-
out_str = '\t' * (depth + 1) + str(cluster)
301-
print(out_str, file=file)
302-
303-
def get_content_as_tokens(self, content):
304-
content = content.strip()
305-
for delimiter in self.extra_delimiters:
306-
content = content.replace(delimiter, " ")
307-
content_tokens = content.split()
308-
return content_tokens
309-
310-
def add_log_message(self, content: str):
311-
content_tokens = self.get_content_as_tokens(content)
312-
313-
if self.profiler:
314-
self.profiler.start_section("tree_search")
315-
match_cluster = self.tree_search(self.root_node, content_tokens, self.sim_th, False)
316-
if self.profiler:
317-
self.profiler.end_section()
318-
319-
# Match no existing log cluster
320-
if match_cluster is None:
321-
if self.profiler:
322-
self.profiler.start_section("create_cluster")
323-
self.clusters_counter += 1
324-
cluster_id = self.clusters_counter
325-
match_cluster = LogCluster(content_tokens, cluster_id)
326-
self.id_to_cluster[cluster_id] = match_cluster
327-
self.add_seq_to_prefix_tree(self.root_node, match_cluster)
328-
update_type = "cluster_created"
329-
330-
# Add the new log message to the existing cluster
331-
else:
332-
if self.profiler:
333-
self.profiler.start_section("cluster_exist")
334-
new_template_tokens = self.create_template(content_tokens, match_cluster.log_template_tokens)
335-
if tuple(new_template_tokens) == match_cluster.log_template_tokens:
336-
update_type = "none"
337-
else:
338-
match_cluster.log_template_tokens = tuple(new_template_tokens)
339-
update_type = "cluster_template_changed"
340-
match_cluster.size += 1
341-
# Touch cluster to update its state in the cache.
342-
# noinspection PyStatementEffect
343-
self.id_to_cluster[match_cluster.cluster_id]
344-
345-
if self.profiler:
346-
self.profiler.end_section()
347-
348-
return match_cluster, update_type
349-
350-
def get_clusters_ids_for_seq_len(self, seq_len: int):
351-
"""
352-
Return all clusters with the specified count of tokens
353-
"""
354-
355-
def append_clusters_recursive(node: Node, id_list_to_fill: list):
356-
id_list_to_fill.extend(node.cluster_ids)
357-
for child_node in node.key_to_child_node.values():
358-
append_clusters_recursive(child_node, id_list_to_fill)
359-
360-
cur_node = self.root_node.key_to_child_node.get(str(seq_len))
361-
362-
# no template with same token count
363-
if cur_node is None:
364-
return []
365-
366-
target = []
367-
append_clusters_recursive(cur_node, target)
368-
return target
369-
370383
def match(self, content: str, full_search_strategy="never"):
371384
"""
372385
Match log message against an already existing cluster.
@@ -413,9 +426,3 @@ def full_search():
413426
return None
414427

415428
return full_search()
416-
417-
def get_total_cluster_size(self):
418-
size = 0
419-
for c in self.id_to_cluster.values():
420-
size += c.size
421-
return size

0 commit comments

Comments
 (0)