Skip to content

Commit e0ed100

Browse files
committed
update
1 parent 038f297 commit e0ed100

16 files changed

Lines changed: 312 additions & 378 deletions

.gitignore

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,14 @@ outputs/
44
__pycache__/
55
dist/
66
*.tar
7-
7+
docs/exps/
88
cache_info.log
99
cache_test.py
1010
temp_eval.sh
1111
test.ipynb
12-
datasets/loghub-2.0/*
12+
tools.ipynb
13+
datasets/loghub-2.0/*
14+
demo_rs.py
15+
draw.ipynb
16+
parsing_demo.sh
17+
parsing.sh

benchmark.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,14 @@ def set_args():
1212
help='the Large Lauguage model used in LogBatcher, default to be gpt-4o-mini.')
1313
parser.add_argument('--batch_size', type=int, default=10,
1414
help='The size of a batch.')
15+
parser.add_argument('--min_size', type=int, default=3,
16+
help='Minimum size of logs in a batch.')
1517
parser.add_argument('--sample_method', type=str, default='dpp', choices=['dpp', 'random', 'similar'],
1618
help='Sample method: dpp, random, similar.')
1719
parser.add_argument('--chunk_size', type=int, default=10000,
1820
help='Size of logs in a chunk.')
21+
parser.add_argument('--benchmark_mode', type=int, default=0,
22+
help='different setting')
1923
parser.add_argument('--config', type=str, default="null")
2024
args = parser.parse_args()
2125
return args
@@ -28,7 +32,7 @@ def set_args():
2832

2933
# output dir
3034
if args.config == 'null':
31-
output_folder = f"logb2_{args.model.split('/')[-1].replace('.','_').replace('-','_')}"
35+
output_folder = f"logb2_minsize{args.min_size}"
3236
else:
3337
output_folder = args.config
3438
output_dir = f'outputs/parser/{output_folder}/'
@@ -41,7 +45,6 @@ def set_args():
4145
if os.path.exists(f'{output_dir}{dataset}_full.log_structured.csv'):
4246
print(f'{dataset} has been parsed, skip it.')
4347
continue
44-
4548
structured_log_file = f'datasets/loghub-2.0/{dataset}/{dataset}_full.log_structured.csv'
4649

4750
log_file_format = 'structured'
@@ -65,6 +68,8 @@ def set_args():
6568
batch_size=args.batch_size,
6669
chunk_size=args.chunk_size,
6770
sample_method = args.sample_method,
71+
min_size = args.min_size,
72+
benchmark_mode = args.benchmark_mode,
6873
)
6974
print('time cost by llm: ', parser.time_consumption_llm)
7075
parser.time_consumption_llm = 0

config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"api_key_from_openai": "<OpenAI_API_KEY>",
2+
"api_key_from_openai": "",
33
"api_key_from_together":"<Together_API_KEY>",
44
"datasets_format" : {
55
"HDFS": "<Date> <Time> <Pid> <Level> <Component>: <Content>",

demo.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ def set_args():
1212
help='the Large Lauguage model used in LogBatcher, default to be gpt-4o-mini.')
1313
parser.add_argument('--dataset', type=str, default='Proxifier')
1414
parser.add_argument('--folder', type=str, default='test')
15+
parser.add_argument('--benchmark_mode', type=int, default=0,
16+
help='different setting')
1517
args = parser.parse_args()
1618
return args
1719

@@ -41,4 +43,5 @@ def set_args():
4143
output_dir= f'outputs/parser/{folder_name}/',
4244
parser=parser,
4345
debug=True,
46+
benchmark_mode=args.benchmark_mode
4447
)

evaluation/logbatcher_eval.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,25 @@
2323
from evaluation.utils.evaluator_main import evaluator, prepare_results
2424
from evaluation.utils.postprocess import post_average
2525

26+
def should_skip_dataset(result_file, dataset):
    """Decide whether a dataset's evaluation can be skipped.

    Scans ``result_file`` (a CSV whose data rows begin with the dataset
    name) for the first line starting with ``dataset``:

    * if every field after the name is non-empty and not the literal
      string ``'None'``, the stored result is complete -> return True;
    * otherwise the stale/incomplete row is deleted from the file and
      False is returned so the dataset gets re-evaluated.

    Returns False when the file does not exist or holds no row for the
    dataset.
    """
    if not os.path.exists(result_file):
        return False
    with open(result_file, 'r') as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if not line.startswith(dataset):
            continue
        parts = line.strip().split(',')
        # A result row counts as valid only when no trailing field is
        # blank or the literal string 'None'.
        if all(part not in ('', 'None') for part in parts[1:]):
            return True
        # Drop the incomplete row by index (list.remove would delete the
        # first *equal* line, which is wrong with duplicates) and persist
        # the cleaned file before asking for a re-run.
        del lines[i]
        with open(result_file, 'w') as file:
            file.writelines(lines)
        return False
    return False
2645

2746
datasets_2k = [
2847
"Proxifier",
@@ -65,7 +84,13 @@
6584
if args.dataset != "null":
6685
datasets = [args.dataset]
6786

87+
88+
6889
for dataset in datasets:
90+
if should_skip_dataset(os.path.join(output_dir, result_file), dataset):
91+
print(f"Skipping dataset {dataset} as it already has valid results.")
92+
continue
93+
6994
setting = benchmark_settings[dataset]
7095
log_file = setting['log_file'].replace("_2k", f"_{args.data_type}")
7196
if os.path.exists(os.path.join(output_dir, f"{dataset}.log_structured.csv")):

evaluation/utils/evaluator_main.py

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from evaluation.utils.ED_calculator import calculate_edit_distance
2727
import pandas as pd
2828

29+
_compiled_template_cache = {}
30+
_compiled_regex_cache = {}
2931

3032
def prepare_results(output_dir, otc):
3133
if not os.path.exists(output_dir):
@@ -44,8 +46,12 @@ def prepare_results(output_dir, otc):
4446
return result_file
4547

4648

47-
def correct_template_general(template, dataset):
49+
def correct_template_general(groundtruth_row):
4850
# Substitute consecutive variables only if separated with any delimiter including "." (DV)
51+
template = groundtruth_row['EventTemplate']
52+
start = template
53+
if start in _compiled_template_cache:
54+
return _compiled_template_cache[start]
4955
while True:
5056
prev = template
5157
template = re.sub(r'<\*>\.<\*>', '<*>', template)
@@ -57,22 +63,23 @@ def correct_template_general(template, dataset):
5763
#print("CV: ", template)
5864
while True:
5965
prev = template
66+
template = re.sub(r'\s+', ' ', template)
6067
template = re.sub(r'<\*><\*>', '<*>', template)
6168
template = re.sub(r'<\*>\:<\*>', '<*>', template)
6269
template = re.sub(r'<\*> <\*>', '<*>', template)
6370
# All
6471
template = re.sub(r'\'<\*>\'', '<*>', template)
65-
template = re.sub(r'<\*> [KGTM]?B', '<*>', template)
72+
template = re.sub(r'<\*> [KGTM]?B\b', '<*>', template)
6673
# HPC
6774
template = re.sub(r'node-<\*>', '<*>', template)
6875
template = re.sub(r'node-\[<\*>\]', '<*>', template)
6976
# HealthApp
7077
template = re.sub(r'<\*>\#\#<\*>', '<*>', template)
7178
template = re.sub(r'<\*>\,<\*>', '<*>', template)
72-
# OpenStack
73-
template = re.sub(r'GET <\*>', '<*>', template)
74-
template = re.sub(r'POST <\*>', '<*>', template)
75-
template = re.sub(r'DELETE <\*>', '<*>', template)
79+
# Apache and OpenStack
80+
# template = re.sub(r'GET <\*>', '<*>', template)
81+
# template = re.sub(r'POST <\*>', '<*>', template)
82+
# template = re.sub(r'DELETE <\*>', '<*>', template)
7683
# Linux and OpenSSH
7784
template = re.sub(r'tty\=ssh', 'tty=<*>', template)
7885
template = re.sub(r'tty\=NODEVssh', 'tty=<*>', template)
@@ -82,34 +89,36 @@ def correct_template_general(template, dataset):
8289

8390
while "<*>:<*>" in template:
8491
template = template.replace("<*>:<*>", "<*>")
85-
92+
_compiled_template_cache[start] = template
8693
return template
8794

88-
def align_with_null_values(groudtruth_row):
95+
96+
def align_with_null_values(groundtruth_row):
8997
"""
90-
Align the null values in the groundtruth with Content.
98+
Align the null values in the groundtruth with Content, optimized with caching.
9199
"""
100+
log = groundtruth_row['Content']
101+
template = groundtruth_row['EventTemplate']
92102

93-
log = groudtruth_row['Content']
94-
template = groudtruth_row['EventTemplate']
95-
96-
pattern_parts = template.split("<*>")
97-
pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
98-
regex_pattern = "(.*?)".join(pattern_parts_escaped)
99-
regex = "^" + regex_pattern + "$"
100-
matches = re.search(regex, log)
101-
102-
if matches == None:
103+
if template in _compiled_regex_cache:
104+
regex, pattern_parts = _compiled_regex_cache[template]
105+
else:
106+
pattern_parts = template.split("<*>")
107+
pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
108+
regex_pattern = "^" + "(.*?)".join(pattern_parts_escaped) + "$"
109+
regex = re.compile(regex_pattern)
110+
_compiled_regex_cache[template] = (regex, pattern_parts)
111+
112+
matches = regex.search(log)
113+
if not matches:
103114
return template
104115

116+
groups = matches.groups()
105117
parts = []
106-
for index, part in enumerate(template.split("<*>")):
118+
for index, part in enumerate(pattern_parts):
107119
parts.append(part)
108-
if index < len(matches.groups()):
109-
if matches.groups()[index] == '':
110-
parts.append('')
111-
else:
112-
parts.append('<*>')
120+
if index < len(groups):
121+
parts.append('' if groups[index] == '' else '<*>')
113122
return ''.join(parts)
114123

115124
def is_file_empty(file_path):
@@ -142,7 +151,7 @@ def evaluator(
142151
groundtruth = os.path.join(indir, log_file_basename + '_structured.csv')
143152

144153
parsedresult = os.path.join(output_dir, log_file_basename + '_structured.csv')
145-
154+
# parsing_error_result = os.path.join(output_dir, log_file_basename + '_parsing_error.csv')
146155
# if not os.path.exists(parsedresult):
147156
# with open(parsedresult, 'w') as fw:
148157
# pass
@@ -168,12 +177,24 @@ def evaluator(
168177
groundtruth = pd.read_csv(groundtruth, dtype=str)
169178

170179
tqdm.pandas()
180+
global _compiled_template_cache
181+
global _compiled_regex_cache
182+
_compiled_template_cache = {}
183+
_compiled_regex_cache = {}
171184
# ! temporary removes
172185
print("Start to align null values and inconsistent labels")
173186
parsedresult['EventTemplate'] = parsedresult.progress_apply(align_with_null_values, axis=1)
174187
groundtruth['EventTemplate'] = groundtruth.progress_apply(align_with_null_values, axis=1)
175-
parsedresult['EventTemplate'] = parsedresult['EventTemplate'].apply(lambda x: correct_template_general(x, dataset))
176-
groundtruth['EventTemplate'] = groundtruth['EventTemplate'].apply(lambda x: correct_template_general(x, dataset))
188+
parsedresult['EventTemplate'] = parsedresult.progress_apply(correct_template_general, axis=1)
189+
groundtruth['EventTemplate'] = groundtruth.progress_apply(correct_template_general, axis=1)
190+
191+
# output errors
192+
# items = []
193+
# for index, row in groundtruth.iterrows():
194+
# if row['EventTemplate'] != parsedresult.iloc[index]['EventTemplate']:
195+
# items.append([str(row['Content']), str(row['EventTemplate']), str(parsedresult.iloc[index]['EventTemplate'])])
196+
# template_df = pd.DataFrame(items, columns=['Log', 'GroundTruth', 'EventTemplate'])
197+
# template_df[['Log', 'GroundTruth', 'EventTemplate']].to_csv(parsing_error_result, index=False)
177198

178199
# calculate Edit Distance
179200
print('Calculating Edit Distance....')

evaluation/utils/postprocess.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33

44
def post_average(metric_file):
    """Append an 'Average' row holding the per-column numeric means.

    Rewrites ``metric_file`` in place: drops any previously written
    'Average' rows and duplicate dataset rows, computes the column-wise
    mean of all numeric metrics (rounded to 3 decimals), and appends it
    as a row labelled 'Average'.
    """
    metrics = pd.read_csv(metric_file, index_col=False)
    # Keep one row per dataset and discard a stale 'Average' row, so a
    # re-run never folds a previous average into the new one.
    metrics = metrics.loc[metrics['Dataset'] != 'Average'].drop_duplicates(['Dataset'])
    means = metrics.select_dtypes(include=[np.number]).mean().round(3)
    average_row = pd.DataFrame([means.values], columns=means.index)
    average_row.insert(0, 'Dataset', 'Average')
    pd.concat([metrics, average_row], ignore_index=True).to_csv(metric_file, index=False)

logbatcher/additional_cluster.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def assign_labels(clusters, logs, granularity="coarse"):
120120

121121
return labels
122122

123-
def hierichical_clustering(logs, granularity="coarse"):
123+
def hierichical_clustering(logs, granularity="fine"):
124124
contents = {}
125125
for i, x in enumerate(logs):
126126
x, fx = clean(x)

logbatcher/cluster.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,46 @@
44
from sklearn.metrics.pairwise import cosine_similarity
55
from sklearn.cluster import DBSCAN
66
from logbatcher.sample import group_samples_clustering, dpp_sample
7+
from logbatcher.util import not_varibility
78
import random
8-
99
class Cluster:
1010
def __init__(self):
    """A group of similar raw log lines collected for batch parsing."""
    self.logs = []        # raw log lines assigned to this cluster
    self.batch_logs = []  # de-duplicated / sampled subset used as the batch
    self.indexs = []      # original positions of the logs in the input
    self.size = 0         # number of logs appended so far
    self.sample_log = ''  # representative log chosen during batching
1516

1617

1718
def append_log(self, log, index):
    """Record one raw log line and its original position in the input."""
    self.logs.append(log)
    self.indexs.append(index)
    self.size = self.size + 1
21-
22-
def batching(self, batch_size=10, sample_method="dpp"):
22+
23+
def varaible_sampling(self, batch_size=10,sample_method="dpp"):
    """Down-sample the cluster's unique logs to at most ``batch_size``.

    Every alphanumeric character is masked (digit -> '0', letter -> 'a')
    so that TF-IDF similarity reflects token *shape* rather than concrete
    values; a determinantal point process then picks a diverse subset.
    ``sample_method`` is accepted for signature compatibility only.
    """
    unique_logs = list(OrderedDict.fromkeys(self.logs))  # de-duplicate, keep order
    self.batch_logs = unique_logs

    def _mask(match):
        return '0' if match.group().isdigit() else 'a'

    masked = [re.sub(r'[0-9a-zA-Z]', _mask, entry) for entry in unique_logs]
    tfidf = TfidfVectorizer().fit_transform(masked).toarray()
    similarity = cosine_similarity(tfidf)
    chosen = dpp_sample(similarity, batch_size)
    self.batch_logs = [unique_logs[i] for i in chosen]
37+
38+
def batching(self, batch_size=10, sample_method="dpp", min_size=3):
    """Build ``self.batch_logs``, the representative batch for this cluster.

    Duplicates are removed (order preserved); when more than ``batch_size``
    unique logs remain they are down-sampled via ``self.sample``.  The
    first batched log is kept as ``self.sample_log``.  When the batch shows
    no variability (per ``not_varibility``) it is truncated to ``min_size``
    entries, since more identically-shaped logs add no information.

    Args:
        batch_size: maximum number of logs in the batch.
        sample_method: strategy forwarded to ``self.sample``
            ('dpp', 'random' or 'similar').
        min_size: batch size used for low-variability clusters.
    """
    self.batch_logs = list(OrderedDict.fromkeys(self.logs))  # remove duplicates
    if len(self.batch_logs) > batch_size:
        self.sample(batch_size, sample_method)
    # Some sampling paths may hand back a bare string; normalise to a list.
    if isinstance(self.batch_logs, str):
        self.batch_logs = [self.batch_logs]
    self.sample_log = self.batch_logs[0]
    if not_varibility(self.batch_logs):
        # Slicing is already a no-op when the batch holds <= min_size logs.
        self.batch_logs = self.batch_logs[:min_size]
2647

2748
def sample(self, batch_size, sample_method):
2849
# vetorize logs
@@ -58,9 +79,10 @@ def tokenize(log_content, tokenize_pattern=r'[ ,|]', removeDight=True):
5879

5980
elif removeDight and re.search(r'\d', word):
6081
pass
61-
elif '/' in word.lower():
82+
elif '/' in word.lower() or re.match(r"^[a-zA-Z][+-]$|^[+-][a-zA-Z]$", word):
6283
pass
6384
else:
85+
word = re.sub(r"\([^)]*\)", "", word)
6486
new_words.append(word)
6587
new_words = [word for word in new_words if word] # remove null
6688
if new_words == []:
@@ -73,7 +95,7 @@ def vectorize(tokenized_logs):
7395
return vectorizer.fit_transform(tokenized_logs)
7496

7597

76-
def cluster(vectorized_logs, eps=0.1):
98+
def cluster(vectorized_logs, eps=0.5):
7799
cluster = DBSCAN(eps=eps, min_samples=5)
78100
cluster.fit(vectorized_logs)
79101
labels = cluster.labels_
@@ -93,4 +115,11 @@ def reassign_clusters(labels, cluster_nums, tokenized_logs):
93115
labels[j] = cluster_nums
94116
labels[i] = cluster_nums
95117
cluster_nums += 1
96-
return labels, cluster_nums
118+
return labels, cluster_nums
119+
120+
def process_new_cluster(new_cluster, clusters, batch_size, sample_method, min_size):
    """Batch a freshly built cluster and register it.

    Returns 1 when the cluster held at least one log (it is batched and
    appended to ``clusters``); returns 0 when it was empty and discarded.
    """
    if new_cluster.size == 0:
        return 0
    new_cluster.batching(batch_size, sample_method, min_size)
    clusters.append(new_cluster)
    return 1

0 commit comments

Comments
 (0)