Ansible playbook voor het opzetten van een MetaVox loadtest omgeving: - 50 teamfolders met 10.000 bestanden elk (500K totaal) - 100 metadata velddefinities (10 teamfolder + 90 file-level) - 3-niveau mappenstructuur (10 hoofdmappen x 3 submappen) - ~43M metadata records via directe MySQL inserts - Geoptimaliseerde database indexes (7 redundante indexes gedropt) Gebruikt directe filesystem writes en MySQL inserts i.p.v. WebDAV/API voor maximale performance. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
248 lines
9.4 KiB
Django/Jinja
#!/usr/bin/env python3
"""
Fast Metadata Insert - Direct MySQL
=====================================

Inserts metadata records for all files via direct MySQL inserts.
Much faster than the MetaVox API (~43M records in ~2 hours).

Tables:
- oc_metavox_file_gf_meta: per-file metadata (file_id, groupfolder_id, field_name, field_value)
- oc_metavox_gf_metadata: per-teamfolder metadata (groupfolder_id, field_name, field_value)
"""
import subprocess
import time
import random
import string
from datetime import datetime, timedelta
|
# Database name; substituted by Ansible/Jinja2 when the template is rendered.
DB_NAME = "{{ nextcloud_db_name }}"
# Load-test scale parameters, also rendered by Jinja2.
NUM_TEAMFOLDERS = {{ num_teamfolders }}
FILES_PER_FOLDER = {{ files_per_teamfolder }}
SQL_BATCH_SIZE = {{ sql_batch_size }}

# Pools of sample values used by gen_value() to fabricate realistic data.
NAMES = ["Jan", "Piet", "Klaas", "Marie", "Anna", "Sophie", "Thomas", "Eva",
         "Lucas", "Emma", "Liam", "Olivia", "Noah", "Mia", "Daan", "Sara"]
ORGS = ["UvA", "HvA", "VU", "TU Delft", "Gemeente Amsterdam", "Ministerie BZK",
        "Rijkswaterstaat", "UWV", "DUO", "KNAW"]
CITIES = ["Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven",
          "Groningen", "Tilburg", "Almere", "Breda", "Nijmegen"]
WORDS = ["beleid", "rapport", "analyse", "voorstel", "evaluatie", "plan",
         "nota", "brief", "contract", "factuur", "verslag", "advies"]

# Per-file field definitions as (name, "type" or "type:opt1,opt2,...") tuples,
# expanded from the Ansible variable at render time.
FILE_FIELDS = [
{% for field in file_metadata_fields %}
    ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"),
{% endfor %}
]

# Per-teamfolder field definitions, same shape as FILE_FIELDS.
TF_FIELDS = [
{% for field in teamfolder_metadata_fields %}
    ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"),
{% endfor %}
]
|
|
|
|
|
|
def random_date():
    """Return a random ISO date string (YYYY-MM-DD) in [2020-01-01, 2026-12-31]."""
    lower = datetime(2020, 1, 1)
    span_days = (datetime(2026, 12, 31) - lower).days
    picked = lower + timedelta(days=random.randint(0, span_days))
    return picked.strftime("%Y-%m-%d")
|
|
|
|
|
|
def gen_value(fname, ftype):
    """Generate a plausible random value for a metadata field.

    ``ftype`` selects the kind of value: ``dropdown:<a>,<b>,...`` picks one
    of the listed options, ``date`` yields a random ISO date, ``checkbox``
    yields "true"/"false", and anything else is treated as free text whose
    format is chosen from keywords found in ``fname``.
    """
    if ftype.startswith("dropdown:"):
        option_list = ftype.partition(":")[2].split(",")
        return random.choice(option_list)
    if ftype == "date":
        return random_date()
    if ftype == "checkbox":
        return random.choice(["true", "false"])

    # Free-text field: keyword heuristics on the field name, first match wins.
    if "email" in fname:
        return f"{random.choice(NAMES).lower()}@example.nl"
    if "phone" in fname:
        return f"+31 6 {random.randint(10000000, 99999999)}"
    if "postal" in fname:
        digits = random.randint(1000, 9999)
        letters = "".join(random.choices(string.ascii_uppercase, k=2))
        return f"{digits} {letters}"
    if "city" in fname:
        return random.choice(CITIES)
    if "country" in fname:
        return "Nederland"
    if "organization" in fname:
        return random.choice(ORGS)
    if any(key in fname for key in ("author", "creator", "contact", "owner")):
        first = random.choice(NAMES)
        last = random.choice(["de Vries", "Jansen", "Bakker", "Visser"])
        return f"{first} {last}"
    if any(key in fname for key in ("number", "code", "identifier", "reference")):
        return f"{fname[:3].upper()}-{random.randint(10000, 99999)}"
    if "version" in fname:
        return f"{random.randint(1, 10)}.{random.randint(0, 9)}"
    if "language" in fname:
        return random.choice(["Nederlands", "Engels", "Duits"])
    return " ".join(random.choices(WORDS, k=random.randint(1, 3))).capitalize()
|
|
|
|
|
|
def mysql_exec(sql):
    """Run *sql* against DB_NAME via the mysql CLI and return stripped stdout.

    Failures are reported (truncated stderr) but not raised; the caller
    still receives whatever stdout was produced.
    """
    proc = subprocess.run(
        ["mysql", "-u", "root", DB_NAME, "-N", "-e", sql],
        capture_output=True,
        text=True,
        timeout=120,
    )
    if proc.returncode != 0:
        print(f" [SQL ERROR] {proc.stderr[:200]}")
    return proc.stdout.strip()
|
|
|
|
|
|
def mysql_exec_file(filepath):
    """Pipe the SQL file at *filepath* into the mysql CLI.

    Uses a raised client packet limit for the large batched INSERTs.
    Returns the CompletedProcess; errors are printed, not raised.
    """
    with open(filepath) as sql_stream:
        proc = subprocess.run(
            ["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"],
            stdin=sql_stream,
            capture_output=True,
            text=True,
            timeout=300,
        )
    if proc.returncode != 0:
        print(f" [SQL ERROR] {proc.stderr[:200]}")
    return proc
|
|
|
|
|
|
def escape_sql(s):
    """Backslash-escape characters that would break a single-quoted MySQL literal."""
    out = []
    for ch in s:
        if ch in ("\\", "'", '"'):
            out.append("\\")
        out.append(ch)
    return "".join(out)
|
|
|
|
|
|
def main():
    """Populate the MetaVox metadata tables with generated load-test data.

    Flow: map teamfolder index -> groupfolder id, map groupfolder id ->
    storage id, skip folders that already hold metadata, then bulk-insert
    groupfolder metadata and per-file metadata via SQL files piped to the
    mysql CLI (much faster than the WebDAV/API path).
    """
    print("=" * 60)
    print("Fast Metadata Insert - Direct MySQL")
    print(f"File fields: {len(FILE_FIELDS)}, TF fields: {len(TF_FIELDS)}")
    print(f"Doel: {NUM_TEAMFOLDERS} folders x {FILES_PER_FOLDER} files x {len(FILE_FIELDS)} fields")
    print(f" = {NUM_TEAMFOLDERS * FILES_PER_FOLDER * len(FILE_FIELDS):,} file metadata rows")
    print(f" + {NUM_TEAMFOLDERS * len(TF_FIELDS):,} groupfolder metadata rows")
    print("=" * 60)

    start_time = time.time()

    # Step 1: Get groupfolder mapping
    # Build teamfolder index (numeric suffix of the mount point) -> groupfolder id.
    print("\nStap 1: Ophalen groupfolder mapping...")
    rows = mysql_exec("""
        SELECT folder_id, mount_point FROM oc_group_folders
        WHERE mount_point LIKE '{{ teamfolder_prefix }}-%'
        ORDER BY folder_id
    """)
    gf_map = {}
    for line in rows.split('\n'):
        if line.strip():
            # mysql -N output is tab-separated: folder_id <TAB> mount_point.
            parts = line.split('\t')
            gf_id = int(parts[0])
            # Mount point looks like "<prefix>-<n>"; the trailing number is the index.
            tf_idx = int(parts[1].split('-')[-1])
            gf_map[tf_idx] = gf_id
    print(f" {len(gf_map)} groupfolders gevonden")

    # Step 2: Get storage mapping
    # Map groupfolder id -> oc_storages.numeric_id (needed to query oc_filecache).
    print("\nStap 2: Ophalen storage mapping...")
    rows = mysql_exec("""
        SELECT s.numeric_id,
               REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num
        FROM oc_storages s
        WHERE s.id LIKE 'local::%/__groupfolders/%'
    """)
    storage_map = {}
    for line in rows.split('\n'):
        if line.strip():
            parts = line.split('\t')
            # folder_num (the groupfolder id embedded in the storage id) -> numeric_id.
            storage_map[int(parts[1])] = int(parts[0])

    # Step 3: Check which folders are already complete
    # Makes re-runs resumable: folders with enough rows are skipped below.
    print("\nStap 3: Checken welke folders al klaar zijn...")
    done_rows = mysql_exec("""
        SELECT groupfolder_id, COUNT(*) as cnt
        FROM oc_metavox_file_gf_meta
        GROUP BY groupfolder_id
    """)
    done_gf_ids = set()
    target_per_folder = FILES_PER_FOLDER * len(FILE_FIELDS)
    for line in (done_rows or "").split('\n'):
        if line.strip():
            parts = line.split('\t')
            gf_id = int(parts[0])
            cnt = int(parts[1])
            # NOTE(review): a folder counts as "done" at only 10% of the target
            # row count -- confirm this low threshold is intentional.
            if cnt >= target_per_folder * 0.1:
                done_gf_ids.add(gf_id)
                print(f" gf_id={gf_id}: {cnt}/{target_per_folder} ({cnt*100//target_per_folder}%) - SKIP")
    print(f" {len(done_gf_ids)} folders al (grotendeels) compleet, worden overgeslagen")

    # Step 4: Insert groupfolder metadata
    # One timestamp is reused for every created_at/updated_at value.
    print("\nStap 4: Groupfolder metadata invoegen...")
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    values = []
    for tf_idx in range(1, NUM_TEAMFOLDERS + 1):
        gf_id = gf_map.get(tf_idx)
        if not gf_id:
            continue
        for fname, ftype in TF_FIELDS:
            val = escape_sql(gen_value(fname, ftype))
            values.append(f"({gf_id}, '{fname}', '{val}', '{now}', '{now}')")

    # Write one multi-row INSERT to disk and pipe it through the mysql CLI.
    # NOTE(review): if `values` is empty this writes an INSERT with no rows,
    # which the server will reject (error is printed, not raised).
    sql_file = "/tmp/gf_metadata_insert.sql"
    with open(sql_file, 'w') as f:
        f.write("INSERT INTO oc_metavox_gf_metadata (groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n")
        f.write(",\n".join(values))
        f.write(";\n")
    mysql_exec_file(sql_file)
    print(f" {len(values)} groupfolder metadata records ingevoegd")

    # Step 5: Insert file metadata per folder
    print("\nStap 5: File metadata invoegen...")
    total_inserted = 0

    for tf_idx in range(1, NUM_TEAMFOLDERS + 1):
        gf_id = gf_map.get(tf_idx)
        if not gf_id:
            continue
        if gf_id in done_gf_ids:
            # Count skipped folders as inserted so the progress totals line up.
            total_inserted += FILES_PER_FOLDER * len(FILE_FIELDS)
            continue
        storage_id = storage_map.get(gf_id)
        if not storage_id:
            print(f" [SKIP] Geen storage voor gf_id {gf_id}")
            continue

        # Get file IDs for this folder
        # NOTE: the LIKE prefix below is substituted by Jinja2 at template
        # render time, before Python ever evaluates this f-string.
        file_ids_raw = mysql_exec(f"""
            SELECT fileid FROM oc_filecache
            WHERE storage = {storage_id} AND name LIKE '{{ dummy_file_prefix }}%'
        """)
        file_ids = [int(x) for x in file_ids_raw.split('\n') if x.strip()]

        if not file_ids:
            print(f" [SKIP] Folder {tf_idx}: geen bestanden")
            continue

        # Generate all rows for this folder
        values = []
        for fid in file_ids:
            for fname, ftype in FILE_FIELDS:
                val = escape_sql(gen_value(fname, ftype))
                values.append(f"({fid}, {gf_id}, '{fname}', '{val}', '{now}', '{now}')")

        # Write in batches
        # SQL_BATCH_SIZE rows per INSERT keeps each statement under the
        # client's 64M packet limit.
        for batch_start in range(0, len(values), SQL_BATCH_SIZE):
            batch = values[batch_start:batch_start + SQL_BATCH_SIZE]
            sql_file = "/tmp/file_meta_batch.sql"
            with open(sql_file, 'w') as f:
                f.write("INSERT INTO oc_metavox_file_gf_meta (file_id, groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n")
                f.write(",\n".join(batch))
                f.write(";\n")
            mysql_exec_file(sql_file)

        total_inserted += len(values)
        elapsed = time.time() - start_time
        rate = total_inserted / elapsed if elapsed > 0 else 0
        print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {len(file_ids)} files x {len(FILE_FIELDS)} fields = {len(values)} rows ({elapsed:.0f}s, totaal: {total_inserted:,}, {rate:.0f}/s)")

    # Final summary.
    elapsed = time.time() - start_time
    print(f"\n{'='*60}")
    print(f"Voltooid in {elapsed:.0f} seconden ({elapsed/3600:.1f} uur)")
    print(f"File metadata records: {total_inserted:,}")
    print(f"GF metadata records: {len(gf_map) * len(TF_FIELDS)}")
    print(f"{'='*60}")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()
|