-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbatch_process_document.py
More file actions
53 lines (40 loc) · 1.62 KB
/
Copy pathbatch_process_document.py
File metadata and controls
53 lines (40 loc) · 1.62 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import os
import shutil
from shared_utils import get_all_document_paths
from data_processing import batch_process_document # We will create this file next
from rebuild_embeddings_and_paragraphs import build_and_cache_embeddings
# --- CONFIGURATION ---
# Cache directory that main() deletes and recreates at the start of every run.
OCR_CACHE_DIR = "data/ocr_cache"

# Database directory that main() likewise deletes and recreates on every run.
CHROMA_PATH = "data/chroma_db"

# NOTE(review): not referenced by the code visible in this file; presumably
# the folders scanned for input documents -- confirm against shared_utils.
DOCUMENT_DIRECTORIES = [
    "data/documents",
    "data/uploads",
]
def main():
    """Rebuild the OCR cache and the embedding store from scratch.

    Deletes and recreates ``OCR_CACHE_DIR`` and ``CHROMA_PATH``, asks
    ``get_all_document_paths()`` for the documents to handle, then calls
    ``batch_process_document`` on each one and, when that call returns a
    truthy value, ``build_and_cache_embeddings`` for the same filename.

    Progress is reported on stdout. Returns ``None``; returns early (after
    the directories have already been reset) when no documents are found.
    """
    managed_dirs = (OCR_CACHE_DIR, CHROMA_PATH)

    # 1. Wipe whatever earlier runs left behind so this run starts clean.
    print("--- Clearing old cache and database ---")
    for stale_dir in managed_dirs:
        if os.path.exists(stale_dir):
            shutil.rmtree(stale_dir)
    # Recreate only after both removals, keeping the remove-then-create order.
    for fresh_dir in managed_dirs:
        os.makedirs(fresh_dir, exist_ok=True)
    print("Old data cleared successfully.")

    # 2. Collect the work list; stop here when there is nothing to do.
    documents = get_all_document_paths()
    if not documents:
        print("No documents found to process. Exiting.")
        return
    print(f"\nFound {len(documents)} documents to process.")

    # 3. Handle the documents one at a time: OCR first, embeddings second.
    for entry in documents:
        name = entry['filename']
        location = entry['path']
        print(f"\n--- Processing Document: {name} ---")

        # Step A: OCR the document (described upstream as bilingual OCR
        # whose output is saved to the cache). A falsy result means failure.
        if not batch_process_document(location, name):
            print(f"[SKIPPING] Could not perform OCR on {name}. Skipping embedding generation.")
            continue

        # Step B: build and cache embeddings for the text that was just OCR'd.
        build_and_cache_embeddings(name)

    print("\n\n--- Batch processing complete. ---")
# Run the batch only when this file is executed as a script, so importing
# the module does not trigger the destructive cache/database reset.
if __name__ == "__main__":
    main()