Skip to content

Commit e0ed100

Browse files
committed
update
1 parent 038f297 commit e0ed100

16 files changed

Lines changed: 312 additions & 378 deletions

.gitignore

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,14 @@ outputs/
44
__pycache__/
55
dist/
66
*.tar
7-
7+
docs/exps/
88
cache_info.log
99
cache_test.py
1010
temp_eval.sh
1111
test.ipynb
12-
datasets/loghub-2.0/*
12+
tools.ipynb
13+
datasets/loghub-2.0/*
14+
demo_rs.py
15+
draw.ipynb
16+
parsing_demo.sh
17+
parsing.sh

benchmark.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,14 @@ def set_args():
1212
help='the Large Lauguage model used in LogBatcher, default to be gpt-4o-mini.')
1313
parser.add_argument('--batch_size', type=int, default=10,
1414
help='The size of a batch.')
15+
parser.add_argument('--min_size', type=int, default=3,
16+
help='Minimum size of logs in a batch.')
1517
parser.add_argument('--sample_method', type=str, default='dpp', choices=['dpp', 'random', 'similar'],
1618
help='Sample method: dpp, random, similar.')
1719
parser.add_argument('--chunk_size', type=int, default=10000,
1820
help='Size of logs in a chunk.')
21+
parser.add_argument('--benchmark_mode', type=int, default=0,
22+
help='different setting')
1923
parser.add_argument('--config', type=str, default="null")
2024
args = parser.parse_args()
2125
return args
@@ -28,7 +32,7 @@ def set_args():
2832

2933
# output dir
3034
if args.config == 'null':
31-
output_folder = f"logb2_{args.model.split('/')[-1].replace('.','_').replace('-','_')}"
35+
output_folder = f"logb2_minsize{args.min_size}"
3236
else:
3337
output_folder = args.config
3438
output_dir = f'outputs/parser/{output_folder}/'
@@ -41,7 +45,6 @@ def set_args():
4145
if os.path.exists(f'{output_dir}{dataset}_full.log_structured.csv'):
4246
print(f'{dataset} has been parsed, skip it.')
4347
continue
44-
4548
structured_log_file = f'datasets/loghub-2.0/{dataset}/{dataset}_full.log_structured.csv'
4649

4750
log_file_format = 'structured'
@@ -65,6 +68,8 @@ def set_args():
6568
batch_size=args.batch_size,
6669
chunk_size=args.chunk_size,
6770
sample_method = args.sample_method,
71+
min_size = args.min_size,
72+
benchmark_mode = args.benchmark_mode,
6873
)
6974
print('time cost by llm: ', parser.time_consumption_llm)
7075
parser.time_consumption_llm = 0

config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"api_key_from_openai": "<OpenAI_API_KEY>",
2+
"api_key_from_openai": "",
33
"api_key_from_together":"<Together_API_KEY>",
44
"datasets_format" : {
55
"HDFS": "<Date> <Time> <Pid> <Level> <Component>: <Content>",

demo.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,8 @@ def set_args():
1212
help='the Large Lauguage model used in LogBatcher, default to be gpt-4o-mini.')
1313
parser.add_argument('--dataset', type=str, default='Proxifier')
1414
parser.add_argument('--folder', type=str, default='test')
15+
parser.add_argument('--benchmark_mode', type=int, default=0,
16+
help='different setting')
1517
args = parser.parse_args()
1618
return args
1719

@@ -41,4 +43,5 @@ def set_args():
4143
output_dir= f'outputs/parser/{folder_name}/',
4244
parser=parser,
4345
debug=True,
46+
benchmark_mode=args.benchmark_mode
4447
)

evaluation/logbatcher_eval.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,25 @@
2323
from evaluation.utils.evaluator_main import evaluator, prepare_results
2424
from evaluation.utils.postprocess import post_average
2525

26+
def should_skip_dataset(result_file, dataset):
    """Decide whether a dataset's evaluation can be skipped.

    Scans ``result_file`` (a CSV whose data rows begin with the dataset
    name) for the first line starting with ``dataset``:

    * if every field after the name is non-empty and not the literal
      string ``'None'``, the stored result is complete -> return True;
    * otherwise the stale/incomplete row is deleted from the file and
      False is returned so the dataset gets re-evaluated.

    Returns False when the file does not exist or holds no row for the
    dataset.
    """
    if not os.path.exists(result_file):
        return False
    with open(result_file, 'r') as file:
        lines = file.readlines()
    for i, line in enumerate(lines):
        if not line.startswith(dataset):
            continue
        parts = line.strip().split(',')
        # A result row counts as valid only when no trailing field is
        # blank or the literal string 'None'.
        if all(part not in ('', 'None') for part in parts[1:]):
            return True
        # Drop the incomplete row by index (list.remove would delete the
        # first *equal* line, which is wrong with duplicates) and persist
        # the cleaned file before asking for a re-run.
        del lines[i]
        with open(result_file, 'w') as file:
            file.writelines(lines)
        return False
    return False
2645

2746
datasets_2k = [
2847
"Proxifier",
@@ -65,7 +84,13 @@
6584
if args.dataset != "null":
6685
datasets = [args.dataset]
6786

87+
88+
6889
for dataset in datasets:
90+
if should_skip_dataset(os.path.join(output_dir, result_file), dataset):
91+
print(f"Skipping dataset {dataset} as it already has valid results.")
92+
continue
93+
6994
setting = benchmark_settings[dataset]
7095
log_file = setting['log_file'].replace("_2k", f"_{args.data_type}")
7196
if os.path.exists(os.path.join(output_dir, f"{dataset}.log_structured.csv")):

evaluation/utils/evaluator_main.py

Lines changed: 49 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
from evaluation.utils.ED_calculator import calculate_edit_distance
2727
import pandas as pd
2828

29+
_compiled_template_cache = {}
30+
_compiled_regex_cache = {}
2931

3032
def prepare_results(output_dir, otc):
3133
if not os.path.exists(output_dir):
@@ -44,8 +46,12 @@ def prepare_results(output_dir, otc):
4446
return result_file
4547

4648

47-
def correct_template_general(template, dataset):
49+
def correct_template_general(groundtruth_row):
4850
# Substitute consecutive variables only if separated with any delimiter including "." (DV)
51+
template = groundtruth_row['EventTemplate']
52+
start = template
53+
if start in _compiled_template_cache:
54+
return _compiled_template_cache[start]
4955
while True:
5056
prev = template
5157
template = re.sub(r'<\*>\.<\*>', '<*>', template)
@@ -57,22 +63,23 @@ def correct_template_general(template, dataset):
5763
#print("CV: ", template)
5864
while True:
5965
prev = template
66+
template = re.sub(r'\s+', ' ', template)
6067
template = re.sub(r'<\*><\*>', '<*>', template)
6168
template = re.sub(r'<\*>\:<\*>', '<*>', template)
6269
template = re.sub(r'<\*> <\*>', '<*>', template)
6370
# All
6471
template = re.sub(r'\'<\*>\'', '<*>', template)
65-
template = re.sub(r'<\*> [KGTM]?B', '<*>', template)
72+
template = re.sub(r'<\*> [KGTM]?B\b', '<*>', template)
6673
# HPC
6774
template = re.sub(r'node-<\*>', '<*>', template)
6875
template = re.sub(r'node-\[<\*>\]', '<*>', template)
6976
# HealthApp
7077
template = re.sub(r'<\*>\#\#<\*>', '<*>', template)
7178
template = re.sub(r'<\*>\,<\*>', '<*>', template)
72-
# OpenStack
73-
template = re.sub(r'GET <\*>', '<*>', template)
74-
template = re.sub(r'POST <\*>', '<*>', template)
75-
template = re.sub(r'DELETE <\*>', '<*>', template)
79+
# Apache and OpenStack
80+
# template = re.sub(r'GET <\*>', '<*>', template)
81+
# template = re.sub(r'POST <\*>', '<*>', template)
82+
# template = re.sub(r'DELETE <\*>', '<*>', template)
7683
# Linux and OpenSSH
7784
template = re.sub(r'tty\=ssh', 'tty=<*>', template)
7885
template = re.sub(r'tty\=NODEVssh', 'tty=<*>', template)
@@ -82,34 +89,36 @@ def correct_template_general(template, dataset):
8289

8390
while "<*>:<*>" in template:
8491
template = template.replace("<*>:<*>", "<*>")
85-
92+
_compiled_template_cache[start] = template
8693
return template
8794

88-
def align_with_null_values(groudtruth_row):
95+
96+
def align_with_null_values(groundtruth_row):
8997
"""
90-
Align the null values in the groundtruth with Content.
98+
Align the null values in the groundtruth with Content, optimized with caching.
9199
"""
100+
log = groundtruth_row['Content']
101+
template = groundtruth_row['EventTemplate']
92102

93-
log = groudtruth_row['Content']
94-
template = groudtruth_row['EventTemplate']
95-
96-
pattern_parts = template.split("<*>")
97-
pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
98-
regex_pattern = "(.*?)".join(pattern_parts_escaped)
99-
regex = "^" + regex_pattern + "$"
100-
matches = re.search(regex, log)
101-
102-
if matches == None:
103+
if template in _compiled_regex_cache:
104+
regex, pattern_parts = _compiled_regex_cache[template]
105+
else:
106+
pattern_parts = template.split("<*>")
107+
pattern_parts_escaped = [re.escape(part) for part in pattern_parts]
108+
regex_pattern = "^" + "(.*?)".join(pattern_parts_escaped) + "$"
109+
regex = re.compile(regex_pattern)
110+
_compiled_regex_cache[template] = (regex, pattern_parts)
111+
112+
matches = regex.search(log)
113+
if not matches:
103114
return template
104115

116+
groups = matches.groups()
105117
parts = []
106-
for index, part in enumerate(template.split("<*>")):
118+
for index, part in enumerate(pattern_parts):
107119
parts.append(part)
108-
if index < len(matches.groups()):
109-
if matches.groups()[index] == '':
110-
parts.append('')
111-
else:
112-
parts.append('<*>')
120+
if index < len(groups):
121+
parts.append('' if groups[index] == '' else '<*>')
113122
return ''.join(parts)
114123

115124
def is_file_empty(file_path):
@@ -142,7 +151,7 @@ def evaluator(
142151
groundtruth = os.path.join(indir, log_file_basename + '_structured.csv')
143152

144153
parsedresult = os.path.join(output_dir, log_file_basename + '_structured.csv')
145-
154+
# parsing_error_result = os.path.join(output_dir, log_file_basename + '_parsing_error.csv')
146155
# if not os.path.exists(parsedresult):
147156
# with open(parsedresult, 'w') as fw:
148157
# pass
@@ -168,12 +177,24 @@ def evaluator(
168177
groundtruth = pd.read_csv(groundtruth, dtype=str)
169178

170179
tqdm.pandas()
180+
global _compiled_template_cache
181+
global _compiled_regex_cache
182+
_compiled_template_cache = {}
183+
_compiled_regex_cache = {}
171184
# ! temporary removes
172185
print("Start to align null values and inconsistent labels")
173186
parsedresult['EventTemplate'] = parsedresult.progress_apply(align_with_null_values, axis=1)
174187
groundtruth['EventTemplate'] = groundtruth.progress_apply(align_with_null_values, axis=1)
175-
parsedresult['EventTemplate'] = parsedresult['EventTemplate'].apply(lambda x: correct_template_general(x, dataset))
176-
groundtruth['EventTemplate'] = groundtruth['EventTemplate'].apply(lambda x: correct_template_general(x, dataset))
188+
parsedresult['EventTemplate'] = parsedresult.progress_apply(correct_template_general, axis=1)
189+
groundtruth['EventTemplate'] = groundtruth.progress_apply(correct_template_general, axis=1)
190+
191+
# output errors
192+
# items = []
193+
# for index, row in groundtruth.iterrows():
194+
# if row['EventTemplate'] != parsedresult.iloc[index]['EventTemplate']:
195+
# items.append([str(row['Content']), str(row['EventTemplate']), str(parsedresult.iloc[index]['EventTemplate'])])
196+
# template_df = pd.DataFrame(items, columns=['Log', 'GroundTruth', 'EventTemplate'])
197+
# template_df[['Log', 'GroundTruth', 'EventTemplate']].to_csv(parsing_error_result, index=False)
177198

178199
# calculate Edit Distance
179200
print('Calculating Edit Distance....')

evaluation/utils/postprocess.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33

44
def post_average(metric_file):
    """Append an 'Average' row holding the per-column numeric means.

    Rewrites ``metric_file`` in place: drops any previously written
    'Average' rows and duplicate dataset rows, computes the column-wise
    mean of all numeric metrics (rounded to 3 decimals), and appends it
    as a row labelled 'Average'.
    """
    metrics = pd.read_csv(metric_file, index_col=False)
    # Keep one row per dataset and discard a stale 'Average' row, so a
    # re-run never folds a previous average into the new one.
    metrics = metrics.loc[metrics['Dataset'] != 'Average'].drop_duplicates(['Dataset'])
    means = metrics.select_dtypes(include=[np.number]).mean().round(3)
    average_row = pd.DataFrame([means.values], columns=means.index)
    average_row.insert(0, 'Dataset', 'Average')
    pd.concat([metrics, average_row], ignore_index=True).to_csv(metric_file, index=False)

logbatcher/additional_cluster.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -120,7 +120,7 @@ def assign_labels(clusters, logs, granularity="coarse"):
120120

121121
return labels
122122

123-
def hierichical_clustering(logs, granularity="coarse"):
123+
def hierichical_clustering(logs, granularity="fine"):
124124
contents = {}
125125
for i, x in enumerate(logs):
126126
x, fx = clean(x)

logbatcher/cluster.py

Lines changed: 35 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,25 +4,46 @@
44
from sklearn.metrics.pairwise import cosine_similarity
55
from sklearn.cluster import DBSCAN
66
from logbatcher.sample import group_samples_clustering, dpp_sample
7+
from logbatcher.util import not_varibility
78
import random
8-
99
class Cluster:
1010
def __init__(self):
    """A group of similar raw log lines collected for batch parsing."""
    self.logs = []        # raw log lines assigned to this cluster
    self.batch_logs = []  # de-duplicated / sampled subset used as the batch
    self.indexs = []      # original positions of the logs in the input
    self.size = 0         # number of logs appended so far
    self.sample_log = ''  # representative log chosen during batching
1516

1617

1718
def append_log(self, log, index):
    """Record one raw log line and its original position in the input."""
    self.logs.append(log)
    self.indexs.append(index)
    self.size = self.size + 1
21-
22-
def batching(self, batch_size=10, sample_method="dpp"):
22+
23+
def varaible_sampling(self, batch_size=10,sample_method="dpp"):
    """Down-sample the cluster's unique logs to at most ``batch_size``.

    Every alphanumeric character is masked (digit -> '0', letter -> 'a')
    so that TF-IDF similarity reflects token *shape* rather than concrete
    values; a determinantal point process then picks a diverse subset.
    ``sample_method`` is accepted for signature compatibility only.
    """
    unique_logs = list(OrderedDict.fromkeys(self.logs))  # de-duplicate, keep order
    self.batch_logs = unique_logs

    def _mask(match):
        return '0' if match.group().isdigit() else 'a'

    masked = [re.sub(r'[0-9a-zA-Z]', _mask, entry) for entry in unique_logs]
    tfidf = TfidfVectorizer().fit_transform(masked).toarray()
    similarity = cosine_similarity(tfidf)
    chosen = dpp_sample(similarity, batch_size)
    self.batch_logs = [unique_logs[i] for i in chosen]
37+
38+
def batching(self, batch_size=10, sample_method="dpp", min_size=3):
    """Build ``self.batch_logs``, the representative batch for this cluster.

    Duplicates are removed (order preserved); when more than ``batch_size``
    unique logs remain they are down-sampled via ``self.sample``.  The
    first batched log is kept as ``self.sample_log``.  When the batch shows
    no variability (per ``not_varibility``) it is truncated to ``min_size``
    entries, since more identically-shaped logs add no information.

    Args:
        batch_size: maximum number of logs in the batch.
        sample_method: strategy forwarded to ``self.sample``
            ('dpp', 'random' or 'similar').
        min_size: batch size used for low-variability clusters.
    """
    self.batch_logs = list(OrderedDict.fromkeys(self.logs))  # remove duplicates
    if len(self.batch_logs) > batch_size:
        self.sample(batch_size, sample_method)
    # Some sampling paths may hand back a bare string; normalise to a list.
    if isinstance(self.batch_logs, str):
        self.batch_logs = [self.batch_logs]
    self.sample_log = self.batch_logs[0]
    if not_varibility(self.batch_logs):
        # Slicing is already a no-op when the batch holds <= min_size logs.
        self.batch_logs = self.batch_logs[:min_size]
2647

2748
def sample(self, batch_size, sample_method):
2849
# vetorize logs
@@ -58,9 +79,10 @@ def tokenize(log_content, tokenize_pattern=r'[ ,|]', removeDight=True):
5879

5980
elif removeDight and re.search(r'\d', word):
6081
pass
61-
elif '/' in word.lower():
82+
elif '/' in word.lower() or re.match(r"^[a-zA-Z][+-]$|^[+-][a-zA-Z]$", word):
6283
pass
6384
else:
85+
word = re.sub(r"\([^)]*\)", "", word)
6486
new_words.append(word)
6587
new_words = [word for word in new_words if word] # remove null
6688
if new_words == []:
@@ -73,7 +95,7 @@ def vectorize(tokenized_logs):
7395
return vectorizer.fit_transform(tokenized_logs)
7496

7597

76-
def cluster(vectorized_logs, eps=0.1):
98+
def cluster(vectorized_logs, eps=0.5):
7799
cluster = DBSCAN(eps=eps, min_samples=5)
78100
cluster.fit(vectorized_logs)
79101
labels = cluster.labels_
@@ -93,4 +115,11 @@ def reassign_clusters(labels, cluster_nums, tokenized_logs):
93115
labels[j] = cluster_nums
94116
labels[i] = cluster_nums
95117
cluster_nums += 1
96-
return labels, cluster_nums
118+
return labels, cluster_nums
119+
120+
def process_new_cluster(new_cluster, clusters, batch_size, sample_method, min_size):
    """Batch a freshly built cluster and register it.

    Returns 1 when the cluster held at least one log (it is batched and
    appended to ``clusters``); returns 0 when it was empty and discarded.
    """
    if new_cluster.size == 0:
        return 0
    new_cluster.batching(batch_size, sample_method, min_size)
    clusters.append(new_cluster)
    return 1

0 commit comments

Comments
 (0)