Initial commit: MetaVox loadtest playbook
Ansible playbook voor het opzetten van een MetaVox loadtest omgeving: - 50 teamfolders met 10.000 bestanden elk (500K totaal) - 100 metadata velddefinities (10 teamfolder + 90 file-level) - 3-niveau mappenstructuur (10 hoofdmappen x 3 submappen) - ~43M metadata records via directe MySQL inserts - Geoptimaliseerde database indexes (7 redundante indexes gedropt) Gebruikt directe filesystem writes en MySQL inserts i.p.v. WebDAV/API voor maximale performance. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
247
templates/fast_metadata_insert.py.j2
Normal file
247
templates/fast_metadata_insert.py.j2
Normal file
@@ -0,0 +1,247 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fast Metadata Insert - Direct MySQL
|
||||
=====================================
|
||||
Voegt metadata records in voor alle bestanden via directe MySQL inserts.
|
||||
Veel sneller dan de MetaVox API (~43M records in ~2 uur).
|
||||
|
||||
Tabellen:
|
||||
- oc_metavox_file_gf_meta: per-bestand metadata (file_id, groupfolder_id, field_name, field_value)
|
||||
- oc_metavox_gf_metadata: per-teamfolder metadata (groupfolder_id, field_name, field_value)
|
||||
"""
|
||||
import subprocess
|
||||
import time
|
||||
import random
|
||||
import string
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
DB_NAME = "{{ nextcloud_db_name }}"
|
||||
NUM_TEAMFOLDERS = {{ num_teamfolders }}
|
||||
FILES_PER_FOLDER = {{ files_per_teamfolder }}
|
||||
SQL_BATCH_SIZE = {{ sql_batch_size }}
|
||||
|
||||
NAMES = ["Jan", "Piet", "Klaas", "Marie", "Anna", "Sophie", "Thomas", "Eva",
|
||||
"Lucas", "Emma", "Liam", "Olivia", "Noah", "Mia", "Daan", "Sara"]
|
||||
ORGS = ["UvA", "HvA", "VU", "TU Delft", "Gemeente Amsterdam", "Ministerie BZK",
|
||||
"Rijkswaterstaat", "UWV", "DUO", "KNAW"]
|
||||
CITIES = ["Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven",
|
||||
"Groningen", "Tilburg", "Almere", "Breda", "Nijmegen"]
|
||||
WORDS = ["beleid", "rapport", "analyse", "voorstel", "evaluatie", "plan",
|
||||
"nota", "brief", "contract", "factuur", "verslag", "advies"]
|
||||
|
||||
FILE_FIELDS = [
|
||||
{% for field in file_metadata_fields %}
|
||||
("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"),
|
||||
{% endfor %}
|
||||
]
|
||||
|
||||
TF_FIELDS = [
|
||||
{% for field in teamfolder_metadata_fields %}
|
||||
("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"),
|
||||
{% endfor %}
|
||||
]
|
||||
|
||||
|
||||
def random_date():
|
||||
start = datetime(2020, 1, 1)
|
||||
delta = (datetime(2026, 12, 31) - start).days
|
||||
return (start + timedelta(days=random.randint(0, delta))).strftime("%Y-%m-%d")
|
||||
|
||||
|
||||
def gen_value(fname, ftype):
|
||||
if ftype.startswith("dropdown:"):
|
||||
options = ftype.split(":", 1)[1].split(",")
|
||||
return random.choice(options)
|
||||
elif ftype == "date":
|
||||
return random_date()
|
||||
elif ftype == "checkbox":
|
||||
return random.choice(["true", "false"])
|
||||
else: # text
|
||||
if "email" in fname:
|
||||
return f"{random.choice(NAMES).lower()}@example.nl"
|
||||
elif "phone" in fname:
|
||||
return f"+31 6 {random.randint(10000000, 99999999)}"
|
||||
elif "postal" in fname:
|
||||
return f"{random.randint(1000, 9999)} {''.join(random.choices(string.ascii_uppercase, k=2))}"
|
||||
elif "city" in fname:
|
||||
return random.choice(CITIES)
|
||||
elif "country" in fname:
|
||||
return "Nederland"
|
||||
elif "organization" in fname:
|
||||
return random.choice(ORGS)
|
||||
elif any(w in fname for w in ["author", "creator", "contact", "owner"]):
|
||||
return f"{random.choice(NAMES)} {random.choice(['de Vries', 'Jansen', 'Bakker', 'Visser'])}"
|
||||
elif any(w in fname for w in ["number", "code", "identifier", "reference"]):
|
||||
return f"{fname[:3].upper()}-{random.randint(10000, 99999)}"
|
||||
elif "version" in fname:
|
||||
return f"{random.randint(1, 10)}.{random.randint(0, 9)}"
|
||||
elif "language" in fname:
|
||||
return random.choice(["Nederlands", "Engels", "Duits"])
|
||||
else:
|
||||
return " ".join(random.choices(WORDS, k=random.randint(1, 3))).capitalize()
|
||||
|
||||
|
||||
def mysql_exec(sql):
|
||||
result = subprocess.run(
|
||||
["mysql", "-u", "root", DB_NAME, "-N", "-e", sql],
|
||||
capture_output=True, text=True, timeout=120
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print(f" [SQL ERROR] {result.stderr[:200]}")
|
||||
return result.stdout.strip()
|
||||
|
||||
|
||||
def mysql_exec_file(filepath):
|
||||
with open(filepath) as f:
|
||||
result = subprocess.run(
|
||||
["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"],
|
||||
stdin=f, capture_output=True, text=True, timeout=300
|
||||
)
|
||||
if result.returncode != 0:
|
||||
print(f" [SQL ERROR] {result.stderr[:200]}")
|
||||
return result
|
||||
|
||||
|
||||
def escape_sql(s):
|
||||
return s.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"')
|
||||
|
||||
|
||||
def main():
|
||||
print("=" * 60)
|
||||
print("Fast Metadata Insert - Direct MySQL")
|
||||
print(f"File fields: {len(FILE_FIELDS)}, TF fields: {len(TF_FIELDS)}")
|
||||
print(f"Doel: {NUM_TEAMFOLDERS} folders x {FILES_PER_FOLDER} files x {len(FILE_FIELDS)} fields")
|
||||
print(f" = {NUM_TEAMFOLDERS * FILES_PER_FOLDER * len(FILE_FIELDS):,} file metadata rows")
|
||||
print(f" + {NUM_TEAMFOLDERS * len(TF_FIELDS):,} groupfolder metadata rows")
|
||||
print("=" * 60)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
# Step 1: Get groupfolder mapping
|
||||
print("\nStap 1: Ophalen groupfolder mapping...")
|
||||
rows = mysql_exec("""
|
||||
SELECT folder_id, mount_point FROM oc_group_folders
|
||||
WHERE mount_point LIKE '{{ teamfolder_prefix }}-%'
|
||||
ORDER BY folder_id
|
||||
""")
|
||||
gf_map = {}
|
||||
for line in rows.split('\n'):
|
||||
if line.strip():
|
||||
parts = line.split('\t')
|
||||
gf_id = int(parts[0])
|
||||
tf_idx = int(parts[1].split('-')[-1])
|
||||
gf_map[tf_idx] = gf_id
|
||||
print(f" {len(gf_map)} groupfolders gevonden")
|
||||
|
||||
# Step 2: Get storage mapping
|
||||
print("\nStap 2: Ophalen storage mapping...")
|
||||
rows = mysql_exec("""
|
||||
SELECT s.numeric_id,
|
||||
REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num
|
||||
FROM oc_storages s
|
||||
WHERE s.id LIKE 'local::%/__groupfolders/%'
|
||||
""")
|
||||
storage_map = {}
|
||||
for line in rows.split('\n'):
|
||||
if line.strip():
|
||||
parts = line.split('\t')
|
||||
storage_map[int(parts[1])] = int(parts[0])
|
||||
|
||||
# Step 3: Check which folders are already complete
|
||||
print("\nStap 3: Checken welke folders al klaar zijn...")
|
||||
done_rows = mysql_exec("""
|
||||
SELECT groupfolder_id, COUNT(*) as cnt
|
||||
FROM oc_metavox_file_gf_meta
|
||||
GROUP BY groupfolder_id
|
||||
""")
|
||||
done_gf_ids = set()
|
||||
target_per_folder = FILES_PER_FOLDER * len(FILE_FIELDS)
|
||||
for line in (done_rows or "").split('\n'):
|
||||
if line.strip():
|
||||
parts = line.split('\t')
|
||||
gf_id = int(parts[0])
|
||||
cnt = int(parts[1])
|
||||
if cnt >= target_per_folder * 0.1:
|
||||
done_gf_ids.add(gf_id)
|
||||
print(f" gf_id={gf_id}: {cnt}/{target_per_folder} ({cnt*100//target_per_folder}%) - SKIP")
|
||||
print(f" {len(done_gf_ids)} folders al (grotendeels) compleet, worden overgeslagen")
|
||||
|
||||
# Step 4: Insert groupfolder metadata
|
||||
print("\nStap 4: Groupfolder metadata invoegen...")
|
||||
now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
values = []
|
||||
for tf_idx in range(1, NUM_TEAMFOLDERS + 1):
|
||||
gf_id = gf_map.get(tf_idx)
|
||||
if not gf_id:
|
||||
continue
|
||||
for fname, ftype in TF_FIELDS:
|
||||
val = escape_sql(gen_value(fname, ftype))
|
||||
values.append(f"({gf_id}, '{fname}', '{val}', '{now}', '{now}')")
|
||||
|
||||
sql_file = "/tmp/gf_metadata_insert.sql"
|
||||
with open(sql_file, 'w') as f:
|
||||
f.write("INSERT INTO oc_metavox_gf_metadata (groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n")
|
||||
f.write(",\n".join(values))
|
||||
f.write(";\n")
|
||||
mysql_exec_file(sql_file)
|
||||
print(f" {len(values)} groupfolder metadata records ingevoegd")
|
||||
|
||||
# Step 5: Insert file metadata per folder
|
||||
print("\nStap 5: File metadata invoegen...")
|
||||
total_inserted = 0
|
||||
|
||||
for tf_idx in range(1, NUM_TEAMFOLDERS + 1):
|
||||
gf_id = gf_map.get(tf_idx)
|
||||
if not gf_id:
|
||||
continue
|
||||
if gf_id in done_gf_ids:
|
||||
total_inserted += FILES_PER_FOLDER * len(FILE_FIELDS)
|
||||
continue
|
||||
storage_id = storage_map.get(gf_id)
|
||||
if not storage_id:
|
||||
print(f" [SKIP] Geen storage voor gf_id {gf_id}")
|
||||
continue
|
||||
|
||||
# Get file IDs for this folder
|
||||
file_ids_raw = mysql_exec(f"""
|
||||
SELECT fileid FROM oc_filecache
|
||||
WHERE storage = {storage_id} AND name LIKE '{{ dummy_file_prefix }}%'
|
||||
""")
|
||||
file_ids = [int(x) for x in file_ids_raw.split('\n') if x.strip()]
|
||||
|
||||
if not file_ids:
|
||||
print(f" [SKIP] Folder {tf_idx}: geen bestanden")
|
||||
continue
|
||||
|
||||
# Generate all rows for this folder
|
||||
values = []
|
||||
for fid in file_ids:
|
||||
for fname, ftype in FILE_FIELDS:
|
||||
val = escape_sql(gen_value(fname, ftype))
|
||||
values.append(f"({fid}, {gf_id}, '{fname}', '{val}', '{now}', '{now}')")
|
||||
|
||||
# Write in batches
|
||||
for batch_start in range(0, len(values), SQL_BATCH_SIZE):
|
||||
batch = values[batch_start:batch_start + SQL_BATCH_SIZE]
|
||||
sql_file = "/tmp/file_meta_batch.sql"
|
||||
with open(sql_file, 'w') as f:
|
||||
f.write("INSERT INTO oc_metavox_file_gf_meta (file_id, groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n")
|
||||
f.write(",\n".join(batch))
|
||||
f.write(";\n")
|
||||
mysql_exec_file(sql_file)
|
||||
|
||||
total_inserted += len(values)
|
||||
elapsed = time.time() - start_time
|
||||
rate = total_inserted / elapsed if elapsed > 0 else 0
|
||||
print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {len(file_ids)} files x {len(FILE_FIELDS)} fields = {len(values)} rows ({elapsed:.0f}s, totaal: {total_inserted:,}, {rate:.0f}/s)")
|
||||
|
||||
elapsed = time.time() - start_time
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Voltooid in {elapsed:.0f} seconden ({elapsed/3600:.1f} uur)")
|
||||
print(f"File metadata records: {total_inserted:,}")
|
||||
print(f"GF metadata records: {len(gf_map) * len(TF_FIELDS)}")
|
||||
print(f"{'='*60}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user