-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathvalidate_jsonls.py
More file actions
executable file
·101 lines (80 loc) · 2.78 KB
/
validate_jsonls.py
File metadata and controls
executable file
·101 lines (80 loc) · 2.78 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python3
import subprocess
import sys
import os
import git
# Define the script to run and the list of JSONL files to validate
# Intended to be run from the root of the repository, as when running the pre-commit hook
# Add the current directory (expected to be the repository root) to sys.path so the `src` package can be imported below
sys.path.append(".")
from src.common.path_utils import get_data_path, get_src_path, get_root_path
from dotenv import load_dotenv
# Load environment variables from the ".env" file located under get_root_path().
env_file = get_root_path() / ".env"
dotenv_path = str(env_file)
print(f"Loading dotenv from {dotenv_path}")
load_dotenv(dotenv_path=dotenv_path)

# Only the exact string "True" enables validation of every discovered file;
# an unset variable or any other value restricts the run to changed files.
validate_all = os.environ.get("VALIDATE_ALL") == "True"

# Path of the per-file validation script that is run as a subprocess.
script = f"{get_src_path()}/validate_fq_jsonl.py"
def _jsonl_entries(directory, is_tuple):
    """Return one entry per ``*.jsonl`` file found recursively under *directory*.

    Each entry is a dict holding the file path as a string under "file" and
    the given *is_tuple* flag under "tuple".
    """
    return [
        {"file": str(found), "tuple": is_tuple}
        for found in directory.rglob("*.jsonl")
    ]


# Directories searched for JSONL data.
real_data_dir = get_data_path() / "fq" / "real"
synthetic_data_dir = get_data_path() / "fq" / "synthetic"
tuple_data_dirs = [
    candidate for candidate in get_data_path().glob("tuple*") if candidate.is_dir()
]

# Files under the "fq" directories are flagged tuple=False; files under any
# "tuple*" directory are flagged tuple=True.
jsonl_files = []
for plain_dir in (real_data_dir, synthetic_data_dir):
    jsonl_files += _jsonl_entries(plain_dir, False)
for tuple_data_dir in tuple_data_dirs:
    jsonl_files += _jsonl_entries(tuple_data_dir, True)

print(f"{len(jsonl_files)} jsonl files found")
# Decide which of the discovered files are validated in this run.
if validate_all:
    # VALIDATE_ALL is "True": check every discovered file.
    jsonl_files_to_check = jsonl_files
else:
    # Otherwise only check the .jsonl files that differ from HEAD.
    repo = git.Repo(search_parent_directories=True)
    diff = repo.git.diff("HEAD", name_only=True)
    diff_files: list[str] = [
        name for name in diff.splitlines() if name.endswith(".jsonl")
    ]

    # The diff lists paths relative to the repository root. Resolve them, and
    # the discovered files, to real absolute paths so that they are compared
    # exactly: a plain suffix comparison would also select e.g.
    # "metadata/a.jsonl" when only "data/a.jsonl" changed.
    repo_root = repo.working_tree_dir
    changed_paths = {
        os.path.realpath(os.path.join(repo_root, name)) for name in diff_files
    }
    jsonl_files_to_check = [
        entry
        for entry in jsonl_files
        if os.path.realpath(entry["file"]) in changed_paths
    ]
print(f"{len(jsonl_files_to_check)} jsonl files to check")
# Function to run the validation script on each file
def validate_file(file_path):
    """Run the validation script on a single JSONL file.

    Args:
        file_path: Mapping with a "file" key (path of the JSONL file) and a
            "tuple" key (truthy to pass ``--tuple`` to the script).

    Returns:
        True if the script exited with status 0, False otherwise. On failure
        the exit code and the captured output of the script are printed.
    """
    # Use the interpreter that is running this script so the child process
    # sees the same installed packages; a bare "python3" may resolve to a
    # different interpreter. Fall back to it only if the path is unknown.
    interpreter = sys.executable or "python3"
    command = [interpreter, script, "--filename", str(file_path["file"])]
    if file_path["tuple"]:
        command.append("--tuple")

    result = subprocess.run(command, capture_output=True, text=True)
    if result.returncode == 0:
        return True

    print(
        f"Validation failed for {str(file_path['file'])}, tuple={file_path['tuple']}."
    )
    # Print the captured streams directly: the repr of the CompletedProcess
    # escapes newlines, which makes tracebacks from the script hard to read.
    print(f"Command: {command}")
    print(f"Exit code: {result.returncode}")
    if result.stdout:
        print(f"stdout:\n{result.stdout}")
    if result.stderr:
        print(f"stderr:\n{result.stderr}")
    return False
def _report_and_validate(entry):
    """Print *entry*, then return the result of validating it."""
    print(f"{entry}")
    return validate_file(entry)


# all() stops at the first entry that fails, so later files are not checked.
all_valid = all(map(_report_and_validate, jsonl_files_to_check))

if not all_valid:
    sys.exit(1)  # Non-zero exit status signals the failure to the caller

print("Validation successful for all files.")