-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy pathpreprocess_data.py
More file actions
executable file
·158 lines (132 loc) · 6.33 KB
/
preprocess_data.py
File metadata and controls
executable file
·158 lines (132 loc) · 6.33 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
"""
Simple script to preprocess datasets using PyDimension 2.0.
This is a convenience script that can be run directly from the package root.
Uses unified config files that contain settings for all modules.
"""
import sys
from pathlib import Path
# Put this script's own directory first on sys.path so the `pydimension` package is importable
sys.path.insert(0, str(Path(__file__).parent))
from pydimension.data_preprocessing import DataPreprocessingConfig, run_dimensional_analysis_preprocessing
# Usage examples shown verbatim at the end of --help (RawDescriptionHelpFormatter
# keeps the line breaks as written).
_USAGE_EXAMPLES = """
Examples:
# Use unified config file
python preprocess_data.py --config pydimension/configs/config_synthetic.json
# Use default config (auto-detect from data generation output)
python preprocess_data.py
# Specify input file directly
python preprocess_data.py --input_file dataset.csv
# Specify input and output variables
python preprocess_data.py --input_file dataset.csv \\
--input_variables p1 p2 p3 p4 p5 p6 p7 \\
--output_variables "p*"
# Use dimension matrix from file
python preprocess_data.py --input_file dataset.csv \\
--dimension_matrix_file output/data/dimension_matrix_synthetic.csv
# Disable normalization
python preprocess_data.py --input_file dataset.csv --no-normalize
"""


def _build_arg_parser():
    """Build the command-line parser for the preprocessing script."""
    import argparse

    parser = argparse.ArgumentParser(
        description='Preprocess datasets and run integrated dimensional analysis',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=_USAGE_EXAMPLES,
    )
    parser.add_argument('--config', '-c', type=str, default=None,
                        help='Path to unified JSON config file. '
                             'If not specified, uses default configuration.')
    parser.add_argument('--input_file', type=str, default=None,
                        help='Path to input CSV file')
    parser.add_argument('--input_variables', type=str, nargs='+', default=None,
                        help='List of input variable names (space-separated)')
    parser.add_argument('--output_variables', type=str, nargs='+', default=None,
                        help='List of output variable names (space-separated)')
    parser.add_argument('--dimension_matrix_file', type=str, default=None,
                        help='Path to dimension matrix CSV file (optional)')
    parser.add_argument('--no-normalize', action='store_true',
                        help='Disable normalization (default: normalize enabled)')
    parser.add_argument('--output_dir', type=str, default=None,
                        help='Base output directory (overrides config, default: "output")')
    parser.add_argument('--normalized_data_filename', type=str, default=None,
                        help='Filename for normalized data (overrides config)')
    parser.add_argument('--dimension_matrix_filename', type=str, default=None,
                        help='Filename for dimension matrix (overrides config)')
    parser.add_argument('--plot', action='store_true',
                        help='Generate and save visualization plots')
    parser.add_argument('--plot_filename', type=str, default=None,
                        help='Filename for saved plot (default: from config or "data_preprocessing_plots.png")')
    return parser


def _apply_cli_overrides(config, args):
    """Copy every command-line option the user actually supplied onto ``config``."""
    # Only truthy values override, so omitted options keep the config's value.
    if args.input_file:
        config.input_file = args.input_file
    if args.input_variables:
        config.input_variables = args.input_variables
    if args.output_variables:
        config.output_variables = args.output_variables
    if args.dimension_matrix_file:
        config.dimension_matrix_file = args.dimension_matrix_file
    if args.no_normalize:
        config.normalize = False
    if args.output_dir:
        config.output_dir = args.output_dir
    if args.normalized_data_filename:
        config.normalized_data_filename = args.normalized_data_filename
    if args.dimension_matrix_filename:
        config.dimension_matrix_filename = args.dimension_matrix_filename


def _plot_filename_from_config(config_path):
    """Return ``DATA_PREPROCESSING_OUTPUT.plot_filename`` from a JSON config, or None."""
    import json

    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            full_config = json.load(f)
        return full_config.get('DATA_PREPROCESSING_OUTPUT', {}).get('plot_filename')
    except (OSError, ValueError, AttributeError) as exc:
        # Best-effort lookup: an unreadable file, malformed JSON (JSONDecodeError
        # is a ValueError) or an unexpected structure must not abort plotting;
        # the caller falls back to the default filename.
        print(f"Warning: could not read plot_filename from {config_path}: {exc}",
              file=sys.stderr)
        return None


def main():
    """Preprocess data from config file or use command-line arguments.

    Loads a ``DataPreprocessingConfig`` (from ``--config`` or defaults), applies
    command-line overrides, validates it, runs the preprocessing and optionally
    saves a visualization.

    Returns:
        int: Process exit status. 0 on success; 1 if the config file is
        missing, the configuration fails validation, or preprocessing raises.
    """
    args = _build_arg_parser().parse_args()

    # Load config
    if args.config:
        if not Path(args.config).exists():
            print(f"Error: Config file not found: {args.config}", file=sys.stderr)
            return 1
        config = DataPreprocessingConfig.from_json(args.config)
        print(f"Loaded config from: {args.config}")
    else:
        # Use default config
        config = DataPreprocessingConfig()
        print("Using default configuration. Use --config to specify a config file.")
        print("Example: python preprocess_data.py --config pydimension/configs/config_synthetic.json")

    # Override config with command-line arguments
    _apply_cli_overrides(config, args)

    # Validate config
    errors = config.validate()
    if errors:
        print("Configuration errors:", file=sys.stderr)
        for error in errors:
            print(f"  - {error}", file=sys.stderr)
        return 1

    # Run preprocessing; any failure is reported with a traceback and exit status 1.
    try:
        artifacts = run_dimensional_analysis_preprocessing(config, verbose=True)

        # Create visualization if requested
        if args.plot:
            # Filename precedence: --plot_filename, then the config file, then the default.
            plot_filename = args.plot_filename
            if plot_filename is None and args.config:
                plot_filename = _plot_filename_from_config(args.config)
            if plot_filename is None:
                plot_filename = 'data_preprocessing_plots.png'

            from pydimension.data_preprocessing import DataPreprocessor
            # NOTE(review): this builds a second preprocessor and calls process()
            # again after the run above -- presumably repeating the work; confirm
            # whether the existing artifacts could be reused for plotting.
            preprocessor = DataPreprocessor(config)
            preprocessor.process(verbose=False)
            plot_path = preprocessor.create_visualization(filename=plot_filename)
            print(f"Plot: {plot_path}")

        print("\n=== Files Saved ===")
        print(f"Normalized data: {artifacts.normalized_data_file}")
        print(f"Dimension matrix: {artifacts.dimension_matrix_file}")
        print(f"Basis vectors: {artifacts.basis_vectors_file}")
        print(f"Normalized lg data: {artifacts.normalized_lg_data_file}")
        return 0
    except Exception as e:
        print(f"Error: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        return 1
if __name__ == '__main__':
    # Script entry point: hand main()'s return value to the interpreter as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)