2626from evaluation .utils .ED_calculator import calculate_edit_distance
2727import pandas as pd
2828
29+ _compiled_template_cache = {}
30+ _compiled_regex_cache = {}
2931
3032def prepare_results (output_dir , otc ):
3133 if not os .path .exists (output_dir ):
@@ -44,8 +46,12 @@ def prepare_results(output_dir, otc):
4446 return result_file
4547
4648
47- def correct_template_general (template , dataset ):
49+ def correct_template_general (groundtruth_row ):
4850 # Substitute consecutive variables only if separated with any delimiter including "." (DV)
51+ template = groundtruth_row ['EventTemplate' ]
52+ start = template
53+ if start in _compiled_template_cache :
54+ return _compiled_template_cache [start ]
4955 while True :
5056 prev = template
5157 template = re .sub (r'<\*>\.<\*>' , '<*>' , template )
@@ -57,22 +63,23 @@ def correct_template_general(template, dataset):
5763 #print("CV: ", template)
5864 while True :
5965 prev = template
66+ template = re .sub (r'\s+' , ' ' , template )
6067 template = re .sub (r'<\*><\*>' , '<*>' , template )
6168 template = re .sub (r'<\*>\:<\*>' , '<*>' , template )
6269 template = re .sub (r'<\*> <\*>' , '<*>' , template )
6370 # All
6471 template = re .sub (r'\'<\*>\'' , '<*>' , template )
65- template = re .sub (r'<\*> [KGTM]?B' , '<*>' , template )
72+ template = re .sub (r'<\*> [KGTM]?B\b ' , '<*>' , template )
6673 # HPC
6774 template = re .sub (r'node-<\*>' , '<*>' , template )
6875 template = re .sub (r'node-\[<\*>\]' , '<*>' , template )
6976 # HealthApp
7077 template = re .sub (r'<\*>\#\#<\*>' , '<*>' , template )
7178 template = re .sub (r'<\*>\,<\*>' , '<*>' , template )
72- # OpenStack
73- template = re .sub (r'GET <\*>' , '<*>' , template )
74- template = re .sub (r'POST <\*>' , '<*>' , template )
75- template = re .sub (r'DELETE <\*>' , '<*>' , template )
79+ # Apache and OpenStack
80+ # template = re.sub(r'GET <\*>', '<*>', template)
81+ # template = re.sub(r'POST <\*>', '<*>', template)
82+ # template = re.sub(r'DELETE <\*>', '<*>', template)
7683 # Linux and OpenSSH
7784 template = re .sub (r'tty\=ssh' , 'tty=<*>' , template )
7885 template = re .sub (r'tty\=NODEVssh' , 'tty=<*>' , template )
@@ -82,34 +89,36 @@ def correct_template_general(template, dataset):
8289
8390 while "<*>:<*>" in template :
8491 template = template .replace ("<*>:<*>" , "<*>" )
85-
92+ _compiled_template_cache [ start ] = template
8693 return template
8794
def align_with_null_values(groundtruth_row):
    """
    Drop template wildcards that match an empty string in the log content.

    The row's ``EventTemplate`` is turned into an anchored regex in which
    every ``<*>`` becomes a lazy ``(.*?)`` capture group and every literal
    piece is escaped.  The regex is matched against the row's ``Content``;
    each wildcard whose captured text is empty is removed from the template,
    the others are kept as ``<*>``.

    Compiled regexes (together with the literal template pieces) are stored
    in the module-level ``_compiled_regex_cache`` keyed by template text, so
    each distinct template is compiled only once.

    Args:
        groundtruth_row: mapping-like row (e.g. a pandas Series) exposing the
            ``'Content'`` and ``'EventTemplate'`` keys.

    Returns:
        The aligned template string.  The original template is returned
        unchanged when the template does not match the content, or when
        either field is not a string.
    """
    log = groundtruth_row['Content']
    template = groundtruth_row['EventTemplate']

    # Non-string cells (presumably NaN produced by pandas for empty CSV
    # fields -- confirm against the input data) cannot be split or searched;
    # treat them like a failed match instead of raising.
    if not isinstance(template, str) or not isinstance(log, str):
        return template

    cached = _compiled_regex_cache.get(template)
    if cached is None:
        pattern_parts = template.split("<*>")
        escaped_parts = [re.escape(part) for part in pattern_parts]
        regex = re.compile("^" + "(.*?)".join(escaped_parts) + "$")
        cached = (regex, pattern_parts)
        _compiled_regex_cache[template] = cached
    regex, pattern_parts = cached

    matches = regex.search(log)
    if matches is None:
        return template

    groups = matches.groups()
    parts = []
    for index, part in enumerate(pattern_parts):
        parts.append(part)
        # There is one capture group per "<*>", i.e. one fewer than the
        # number of literal pieces, so the last piece has no group after it.
        if index < len(groups):
            parts.append('' if groups[index] == '' else '<*>')
    return ''.join(parts)
114123
115124def is_file_empty (file_path ):
@@ -142,7 +151,7 @@ def evaluator(
142151 groundtruth = os .path .join (indir , log_file_basename + '_structured.csv' )
143152
144153 parsedresult = os .path .join (output_dir , log_file_basename + '_structured.csv' )
145-
154+ # parsing_error_result = os.path.join(output_dir, log_file_basename + '_parsing_error.csv')
146155 # if not os.path.exists(parsedresult):
147156 # with open(parsedresult, 'w') as fw:
148157 # pass
@@ -168,12 +177,24 @@ def evaluator(
168177 groundtruth = pd .read_csv (groundtruth , dtype = str )
169178
170179 tqdm .pandas ()
180+ global _compiled_template_cache
181+ global _compiled_regex_cache
182+ _compiled_template_cache = {}
183+ _compiled_regex_cache = {}
171184 # ! temporary removes
172185 print ("Start to align null values and inconsistent labels" )
173186 parsedresult ['EventTemplate' ] = parsedresult .progress_apply (align_with_null_values , axis = 1 )
174187 groundtruth ['EventTemplate' ] = groundtruth .progress_apply (align_with_null_values , axis = 1 )
175- parsedresult ['EventTemplate' ] = parsedresult ['EventTemplate' ].apply (lambda x : correct_template_general (x , dataset ))
176- groundtruth ['EventTemplate' ] = groundtruth ['EventTemplate' ].apply (lambda x : correct_template_general (x , dataset ))
188+ parsedresult ['EventTemplate' ] = parsedresult .progress_apply (correct_template_general , axis = 1 )
189+ groundtruth ['EventTemplate' ] = groundtruth .progress_apply (correct_template_general , axis = 1 )
190+
191+ # output errors
192+ # items = []
193+ # for index, row in groundtruth.iterrows():
194+ # if row['EventTemplate'] != parsedresult.iloc[index]['EventTemplate']:
195+ # items.append([str(row['Content']), str(row['EventTemplate']), str(parsedresult.iloc[index]['EventTemplate'])])
196+ # template_df = pd.DataFrame(items, columns=['Log', 'GroundTruth', 'EventTemplate'])
197+ # template_df[['Log', 'GroundTruth', 'EventTemplate']].to_csv(parsing_error_result, index=False)
177198
178199 # calculate Edit Distance
179200 print ('Calculating Edit Distance....' )
0 commit comments