@@ -49,7 +49,7 @@ def __init__(self):
49
49
self .cluster_ids : List [int ] = []
50
50
51
51
52
- class Drain :
52
+ class DrainBase :
53
53
def __init__ (self ,
54
54
depth = 4 ,
55
55
sim_th = 0.4 ,
@@ -103,6 +103,144 @@ def clusters(self):
103
103
def has_numbers(s):
    """Return True when at least one character of ``s`` is a digit, else False."""
    for ch in s:
        if ch.isdigit():
            return True
    return False
105
105
106
def fast_match(self, cluster_ids: Sequence, tokens: list, sim_th: float, include_params: bool):
    """
    Select the candidate cluster whose template is closest to a tokenized log message.

    :param cluster_ids: IDs of the clusters to compare against.
    :param tokens: the log message, already split into tokens.
    :param sim_th: minimum similarity the winner must reach (None is returned if no cluster reaches it).
    :param include_params: forwarded unchanged to ``get_seq_distance``.
    :return: the best matching cluster, or None.
    """
    # (similarity, param_count) of the best candidate so far; ordering is
    # lexicographic, so a higher similarity wins and param_count breaks ties.
    best_score = (-1, -1)
    best_cluster = None

    for candidate_id in cluster_ids:
        # .get() is used on purpose: candidates are only being inspected, and
        # IDs that are no longer present in the mapping are simply skipped.
        candidate = self.id_to_cluster.get(candidate_id)
        if candidate is None:
            continue
        similarity, param_count = self.get_seq_distance(candidate.log_template_tokens, tokens, include_params)
        score = (similarity, param_count)
        if score > best_score:
            best_score = score
            best_cluster = candidate

    return best_cluster if best_score[0] >= sim_th else None
137
+
138
def print_tree(self, file=None, max_clusters=5):
    """
    Print the whole prefix tree, starting at ``self.root_node``.

    :param file: forwarded to ``print_node`` (and from there to ``print``).
    :param max_clusters: forwarded to ``print_node``; caps the clusters listed per node.
    """
    root_label = "root"
    start_depth = 0
    self.print_node(root_label, self.root_node, start_depth, file, max_clusters)
140
+
141
def print_node(self, token, node, depth, file, max_clusters):
    """
    Recursively print ``node`` and its subtree, one line per node or cluster.

    The node's own line is printed first, then all child subtrees, then the
    clusters attached to this node (indented one level deeper).

    :param token: key under which ``node`` is stored in its parent ("root" for the top-level call).
    :param node: tree node exposing ``cluster_ids`` and ``key_to_child_node``.
    :param depth: nesting level; drives the indentation and the label format.
    :param file: passed to ``print`` as its ``file`` argument.
    :param max_clusters: only the first ``max_clusters`` IDs of ``node.cluster_ids`` are considered.
    """
    # NOTE(review): the spacing inside the literals below is kept exactly as it
    # appears in the reviewed text - confirm against the real file.
    out_str = '\t ' * depth

    if depth == 0:
        out_str += f'<{token} >'
    elif depth == 1:
        # Purely numeric first-level keys get the "L=" label.
        if token.isdigit():
            out_str += f'<L={token} >'
        else:
            out_str += f'<{token} >'
    else:
        out_str += f'"{token} "'

    if len(node.cluster_ids) > 0:
        out_str += f" (cluster_count={len(node.cluster_ids)} )"

    print(out_str, file=file)

    for child_token, child in node.key_to_child_node.items():
        self.print_node(child_token, child, depth + 1, file, max_clusters)

    for cid in node.cluster_ids[:max_clusters]:
        # Use .get() like fast_match does: printing is read-only inspection, so
        # it must neither raise KeyError for an ID that is no longer in the
        # mapping nor "touch" the entry the way item access does.
        cluster = self.id_to_cluster.get(cid)
        if cluster is None:
            continue
        out_str = '\t ' * (depth + 1) + str(cluster)
        print(out_str, file=file)
166
+
167
def get_content_as_tokens(self, content):
    """
    Split a raw log message into a list of tokens.

    The message is stripped, every string in ``self.extra_delimiters`` is
    replaced by a single space, and the result is split on whitespace.

    :param content: the raw log message.
    :return: list of token strings (empty for a blank message).
    """
    text = content.strip()
    for separator in self.extra_delimiters:
        text = text.replace(separator, " ")
    return text.split()
173
+
174
def add_log_message(self, content: str):
    """
    Feed one log message into the tree, creating or updating a cluster.

    :param content: the raw log message.
    :return: tuple ``(cluster, update_type)`` where ``update_type`` is
        "cluster_created", "cluster_template_changed" or "none".
    """
    tokens = self.get_content_as_tokens(content)

    if self.profiler:
        self.profiler.start_section("tree_search")
    cluster = self.tree_search(self.root_node, tokens, self.sim_th, False)
    if self.profiler:
        self.profiler.end_section()

    if cluster is not None:
        # The message belongs to an existing cluster: merge it into the template.
        if self.profiler:
            self.profiler.start_section("cluster_exist")
        merged_template = tuple(self.create_template(tokens, cluster.log_template_tokens))
        if merged_template == cluster.log_template_tokens:
            update_type = "none"
        else:
            cluster.log_template_tokens = merged_template
            update_type = "cluster_template_changed"
        cluster.size += 1
        # Item access is done only for its effect: it touches the cluster so
        # that its state in the cache is updated.
        # noinspection PyStatementEffect
        self.id_to_cluster[cluster.cluster_id]
    else:
        # No existing cluster matched: register a brand-new one.
        if self.profiler:
            self.profiler.start_section("create_cluster")
        self.clusters_counter += 1
        new_id = self.clusters_counter
        cluster = LogCluster(tokens, new_id)
        self.id_to_cluster[new_id] = cluster
        self.add_seq_to_prefix_tree(self.root_node, cluster)
        update_type = "cluster_created"

    if self.profiler:
        self.profiler.end_section()

    return cluster, update_type
213
+
214
def get_total_cluster_size(self):
    """Return the sum of ``size`` over every cluster in ``self.id_to_cluster``."""
    return sum(cluster.size for cluster in self.id_to_cluster.values())
219
+
220
def get_clusters_ids_for_seq_len(self, seq_fir):
    """
    Collect the cluster IDs stored under one first-level branch of the tree.

    :param seq_fir: int or str; it is converted with ``str()`` and used as the
        key of the root's child node to start from.
    :return: list of cluster IDs found in that child node and all of its
        descendants (parents before children), or an empty list when the
        root has no child for that key.
    """
    start_node = self.root_node.key_to_child_node.get(str(seq_fir))

    # No branch for this key at all.
    if start_node is None:
        return []

    collected = []
    pending = [start_node]
    while pending:
        current = pending.pop()
        collected.extend(current.cluster_ids)
        # Push children in reverse so they are popped in their original order,
        # which keeps the same pre-order as a recursive walk.
        pending.extend(reversed(list(current.key_to_child_node.values())))
    return collected
240
+
241
+
242
+ class Drain (DrainBase ):
243
+
106
244
def tree_search (self , root_node : Node , tokens : list , sim_th : float , include_params : bool ):
107
245
108
246
# at first level, children are grouped by token (word) count
@@ -232,38 +370,6 @@ def get_seq_distance(self, seq1, seq2, include_params: bool):
232
370
233
371
return ret_val , param_count
234
372
235
- def fast_match (self , cluster_ids : Sequence , tokens : list , sim_th : float , include_params : bool ):
236
- """
237
- Find the best match for a log message (represented as tokens) versus a list of clusters
238
- :param cluster_ids: List of clusters to match against (represented by their IDs)
239
- :param tokens: the log message, separated to tokens.
240
- :param sim_th: minimum required similarity threshold (None will be returned in no clusters reached it)
241
- :param include_params: consider tokens matched to wildcard parameters in similarity threshold.
242
- :return: Best match cluster or None
243
- """
244
- match_cluster = None
245
-
246
- max_sim = - 1
247
- max_param_count = - 1
248
- max_cluster = None
249
-
250
- for cluster_id in cluster_ids :
251
- # Try to retrieve cluster from cache with bypassing eviction
252
- # algorithm as we are only testing candidates for a match.
253
- cluster = self .id_to_cluster .get (cluster_id )
254
- if cluster is None :
255
- continue
256
- cur_sim , param_count = self .get_seq_distance (cluster .log_template_tokens , tokens , include_params )
257
- if cur_sim > max_sim or (cur_sim == max_sim and param_count > max_param_count ):
258
- max_sim = cur_sim
259
- max_param_count = param_count
260
- max_cluster = cluster
261
-
262
- if max_sim >= sim_th :
263
- match_cluster = max_cluster
264
-
265
- return match_cluster
266
-
267
373
def create_template (self , seq1 , seq2 ):
268
374
assert len (seq1 ) == len (seq2 )
269
375
ret_val = list (seq2 )
@@ -274,99 +380,6 @@ def create_template(self, seq1, seq2):
274
380
275
381
return ret_val
276
382
277
- def print_tree (self , file = None , max_clusters = 5 ):
278
- self .print_node ("root" , self .root_node , 0 , file , max_clusters )
279
-
280
- def print_node (self , token , node , depth , file , max_clusters ):
281
- out_str = '\t ' * depth
282
-
283
- if depth == 0 :
284
- out_str += f'<{ token } >'
285
- elif depth == 1 :
286
- out_str += f'<L={ token } >'
287
- else :
288
- out_str += f'"{ token } "'
289
-
290
- if len (node .cluster_ids ) > 0 :
291
- out_str += f" (cluster_count={ len (node .cluster_ids )} )"
292
-
293
- print (out_str , file = file )
294
-
295
- for token , child in node .key_to_child_node .items ():
296
- self .print_node (token , child , depth + 1 , file , max_clusters )
297
-
298
- for cid in node .cluster_ids [:max_clusters ]:
299
- cluster = self .id_to_cluster [cid ]
300
- out_str = '\t ' * (depth + 1 ) + str (cluster )
301
- print (out_str , file = file )
302
-
303
- def get_content_as_tokens (self , content ):
304
- content = content .strip ()
305
- for delimiter in self .extra_delimiters :
306
- content = content .replace (delimiter , " " )
307
- content_tokens = content .split ()
308
- return content_tokens
309
-
310
- def add_log_message (self , content : str ):
311
- content_tokens = self .get_content_as_tokens (content )
312
-
313
- if self .profiler :
314
- self .profiler .start_section ("tree_search" )
315
- match_cluster = self .tree_search (self .root_node , content_tokens , self .sim_th , False )
316
- if self .profiler :
317
- self .profiler .end_section ()
318
-
319
- # Match no existing log cluster
320
- if match_cluster is None :
321
- if self .profiler :
322
- self .profiler .start_section ("create_cluster" )
323
- self .clusters_counter += 1
324
- cluster_id = self .clusters_counter
325
- match_cluster = LogCluster (content_tokens , cluster_id )
326
- self .id_to_cluster [cluster_id ] = match_cluster
327
- self .add_seq_to_prefix_tree (self .root_node , match_cluster )
328
- update_type = "cluster_created"
329
-
330
- # Add the new log message to the existing cluster
331
- else :
332
- if self .profiler :
333
- self .profiler .start_section ("cluster_exist" )
334
- new_template_tokens = self .create_template (content_tokens , match_cluster .log_template_tokens )
335
- if tuple (new_template_tokens ) == match_cluster .log_template_tokens :
336
- update_type = "none"
337
- else :
338
- match_cluster .log_template_tokens = tuple (new_template_tokens )
339
- update_type = "cluster_template_changed"
340
- match_cluster .size += 1
341
- # Touch cluster to update its state in the cache.
342
- # noinspection PyStatementEffect
343
- self .id_to_cluster [match_cluster .cluster_id ]
344
-
345
- if self .profiler :
346
- self .profiler .end_section ()
347
-
348
- return match_cluster , update_type
349
-
350
- def get_clusters_ids_for_seq_len (self , seq_len : int ):
351
- """
352
- Return all clusters with the specified count of tokens
353
- """
354
-
355
- def append_clusters_recursive (node : Node , id_list_to_fill : list ):
356
- id_list_to_fill .extend (node .cluster_ids )
357
- for child_node in node .key_to_child_node .values ():
358
- append_clusters_recursive (child_node , id_list_to_fill )
359
-
360
- cur_node = self .root_node .key_to_child_node .get (str (seq_len ))
361
-
362
- # no template with same token count
363
- if cur_node is None :
364
- return []
365
-
366
- target = []
367
- append_clusters_recursive (cur_node , target )
368
- return target
369
-
370
383
def match (self , content : str , full_search_strategy = "never" ):
371
384
"""
372
385
Match log message against an already existing cluster.
@@ -413,9 +426,3 @@ def full_search():
413
426
return None
414
427
415
428
return full_search ()
416
-
417
- def get_total_cluster_size (self ):
418
- size = 0
419
- for c in self .id_to_cluster .values ():
420
- size += c .size
421
- return size
0 commit comments