#!/usr/bin/env python3 """ Fast Metadata Insert - Direct MySQL ===================================== Voegt metadata records in voor alle bestanden via directe MySQL inserts. Veel sneller dan de MetaVox API (~43M records in ~2 uur). Tabellen: - oc_metavox_file_gf_meta: per-bestand metadata (file_id, groupfolder_id, field_name, field_value) - oc_metavox_gf_metadata: per-teamfolder metadata (groupfolder_id, field_name, field_value) """ import subprocess import time import random import string from datetime import datetime, timedelta DB_NAME = "{{ nextcloud_db_name }}" NUM_TEAMFOLDERS = {{ num_teamfolders }} FILES_PER_FOLDER = {{ files_per_teamfolder }} SQL_BATCH_SIZE = {{ sql_batch_size }} NAMES = ["Jan", "Piet", "Klaas", "Marie", "Anna", "Sophie", "Thomas", "Eva", "Lucas", "Emma", "Liam", "Olivia", "Noah", "Mia", "Daan", "Sara"] ORGS = ["UvA", "HvA", "VU", "TU Delft", "Gemeente Amsterdam", "Ministerie BZK", "Rijkswaterstaat", "UWV", "DUO", "KNAW"] CITIES = ["Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven", "Groningen", "Tilburg", "Almere", "Breda", "Nijmegen"] WORDS = ["beleid", "rapport", "analyse", "voorstel", "evaluatie", "plan", "nota", "brief", "contract", "factuur", "verslag", "advies"] FILE_FIELDS = [ {% for field in file_metadata_fields %} ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"), {% endfor %} ] TF_FIELDS = [ {% for field in teamfolder_metadata_fields %} ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"), {% endfor %} ] def random_date(): start = datetime(2020, 1, 1) delta = (datetime(2026, 12, 31) - start).days return (start + timedelta(days=random.randint(0, delta))).strftime("%Y-%m-%d") def gen_value(fname, ftype): if ftype.startswith("dropdown:"): options = ftype.split(":", 1)[1].split(",") return random.choice(options) elif ftype == "date": return random_date() elif ftype == "checkbox": return random.choice(["true", "false"]) else: # text if "email" in fname: return f"{random.choice(NAMES).lower()}@example.nl" elif "phone" in fname: return f"+31 6 {random.randint(10000000, 99999999)}" elif "postal" in fname: return f"{random.randint(1000, 9999)} {''.join(random.choices(string.ascii_uppercase, k=2))}" elif "city" in fname: return random.choice(CITIES) elif "country" in fname: return "Nederland" elif "organization" in fname: return random.choice(ORGS) elif any(w in fname for w in ["author", "creator", "contact", "owner"]): return f"{random.choice(NAMES)} {random.choice(['de Vries', 'Jansen', 'Bakker', 'Visser'])}" elif any(w in fname for w in ["number", "code", "identifier", "reference"]): return f"{fname[:3].upper()}-{random.randint(10000, 99999)}" elif "version" in fname: return f"{random.randint(1, 10)}.{random.randint(0, 9)}" elif "language" in fname: return random.choice(["Nederlands", "Engels", "Duits"]) else: return " ".join(random.choices(WORDS, k=random.randint(1, 3))).capitalize() def mysql_exec(sql): result = subprocess.run( ["mysql", "-u", "root", DB_NAME, "-N", "-e", sql], capture_output=True, text=True, timeout=120 ) if result.returncode != 0: print(f" [SQL ERROR] {result.stderr[:200]}") return result.stdout.strip() def mysql_exec_file(filepath): with open(filepath) as f: result = subprocess.run( ["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"], stdin=f, capture_output=True, text=True, timeout=300 ) if result.returncode != 0: print(f" [SQL ERROR] {result.stderr[:200]}") return result def escape_sql(s): return s.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"') def main(): print("=" * 60) print("Fast Metadata Insert - Direct MySQL") print(f"File fields: {len(FILE_FIELDS)}, TF fields: {len(TF_FIELDS)}") print(f"Doel: {NUM_TEAMFOLDERS} folders x {FILES_PER_FOLDER} files x {len(FILE_FIELDS)} fields") print(f" = {NUM_TEAMFOLDERS * FILES_PER_FOLDER * len(FILE_FIELDS):,} file metadata rows") print(f" + {NUM_TEAMFOLDERS * len(TF_FIELDS):,} groupfolder metadata rows") print("=" * 60) start_time = time.time() # Step 1: Get groupfolder mapping print("\nStap 1: Ophalen groupfolder mapping...") rows = mysql_exec(""" SELECT folder_id, mount_point FROM oc_group_folders WHERE mount_point LIKE '{{ teamfolder_prefix }}-%' ORDER BY folder_id """) gf_map = {} for line in rows.split('\n'): if line.strip(): parts = line.split('\t') gf_id = int(parts[0]) tf_idx = int(parts[1].split('-')[-1]) gf_map[tf_idx] = gf_id print(f" {len(gf_map)} groupfolders gevonden") # Step 2: Get storage mapping print("\nStap 2: Ophalen storage mapping...") rows = mysql_exec(""" SELECT s.numeric_id, REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num FROM oc_storages s WHERE s.id LIKE 'local::%/__groupfolders/%' """) storage_map = {} for line in rows.split('\n'): if line.strip(): parts = line.split('\t') storage_map[int(parts[1])] = int(parts[0]) # Step 3: Check which folders are already complete print("\nStap 3: Checken welke folders al klaar zijn...") done_rows = mysql_exec(""" SELECT groupfolder_id, COUNT(*) as cnt FROM oc_metavox_file_gf_meta GROUP BY groupfolder_id """) done_gf_ids = set() target_per_folder = FILES_PER_FOLDER * len(FILE_FIELDS) for line in (done_rows or "").split('\n'): if line.strip(): parts = line.split('\t') gf_id = int(parts[0]) cnt = int(parts[1]) if cnt >= target_per_folder * 0.1: done_gf_ids.add(gf_id) print(f" gf_id={gf_id}: {cnt}/{target_per_folder} ({cnt*100//target_per_folder}%) - SKIP") print(f" {len(done_gf_ids)} folders al (grotendeels) compleet, worden overgeslagen") # Step 4: Insert groupfolder metadata print("\nStap 4: Groupfolder metadata invoegen...") now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") values = [] for tf_idx in range(1, NUM_TEAMFOLDERS + 1): gf_id = gf_map.get(tf_idx) if not gf_id: continue for fname, ftype in TF_FIELDS: val = escape_sql(gen_value(fname, ftype)) values.append(f"({gf_id}, '{fname}', '{val}', '{now}', '{now}')") sql_file = "/tmp/gf_metadata_insert.sql" with open(sql_file, 'w') as f: f.write("INSERT INTO oc_metavox_gf_metadata (groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n") f.write(",\n".join(values)) f.write(";\n") mysql_exec_file(sql_file) print(f" {len(values)} groupfolder metadata records ingevoegd") # Step 5: Insert file metadata per folder print("\nStap 5: File metadata invoegen...") total_inserted = 0 for tf_idx in range(1, NUM_TEAMFOLDERS + 1): gf_id = gf_map.get(tf_idx) if not gf_id: continue if gf_id in done_gf_ids: total_inserted += FILES_PER_FOLDER * len(FILE_FIELDS) continue storage_id = storage_map.get(gf_id) if not storage_id: print(f" [SKIP] Geen storage voor gf_id {gf_id}") continue # Get file IDs for this folder file_ids_raw = mysql_exec(f""" SELECT fileid FROM oc_filecache WHERE storage = {storage_id} AND name LIKE '{{ dummy_file_prefix }}%' """) file_ids = [int(x) for x in file_ids_raw.split('\n') if x.strip()] if not file_ids: print(f" [SKIP] Folder {tf_idx}: geen bestanden") continue # Generate all rows for this folder values = [] for fid in file_ids: for fname, ftype in FILE_FIELDS: val = escape_sql(gen_value(fname, ftype)) values.append(f"({fid}, {gf_id}, '{fname}', '{val}', '{now}', '{now}')") # Write in batches for batch_start in range(0, len(values), SQL_BATCH_SIZE): batch = values[batch_start:batch_start + SQL_BATCH_SIZE] sql_file = "/tmp/file_meta_batch.sql" with open(sql_file, 'w') as f: f.write("INSERT INTO oc_metavox_file_gf_meta (file_id, groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n") f.write(",\n".join(batch)) f.write(";\n") mysql_exec_file(sql_file) total_inserted += len(values) elapsed = time.time() - start_time rate = total_inserted / elapsed if elapsed > 0 else 0 print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {len(file_ids)} files x {len(FILE_FIELDS)} fields = {len(values)} rows ({elapsed:.0f}s, totaal: {total_inserted:,}, {rate:.0f}/s)") elapsed = time.time() - start_time print(f"\n{'='*60}") print(f"Voltooid in {elapsed:.0f} seconden ({elapsed/3600:.1f} uur)") print(f"File metadata records: {total_inserted:,}") print(f"GF metadata records: {len(gf_map) * len(TF_FIELDS)}") print(f"{'='*60}") if __name__ == "__main__": main()