commit a1eda430c81bf8bad974c2edce58548e1fd76ca2 Author: Sam428-png Date: Mon Mar 16 16:55:30 2026 +0100 Initial commit: MetaVox loadtest playbook Ansible playbook voor het opzetten van een MetaVox loadtest omgeving: - 50 teamfolders met 10.000 bestanden elk (500K totaal) - 100 metadata velddefinities (10 teamfolder + 90 file-level) - 3-niveau mappenstructuur (10 hoofdmappen x 3 submappen) - ~43M metadata records via directe MySQL inserts - Geoptimaliseerde database indexes (7 redundante indexes gedropt) Gebruikt directe filesystem writes en MySQL inserts i.p.v. WebDAV/API voor maximale performance. Co-Authored-By: Claude Opus 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a0c0217 --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +*.retry +*.pyc +__pycache__/ +.vagrant/ diff --git a/ansible.cfg b/ansible.cfg new file mode 100644 index 0000000..3ce899c --- /dev/null +++ b/ansible.cfg @@ -0,0 +1,9 @@ +[defaults] +inventory = inventory/hosts.yml +roles_path = roles +stdout_callback = yaml +bin_ansible_callbacks = true +retry_files_enabled = false + +[ssh_connection] +pipelining = true diff --git a/cleanup.yml b/cleanup.yml new file mode 100644 index 0000000..83ba4b8 --- /dev/null +++ b/cleanup.yml @@ -0,0 +1,127 @@ +--- +# ============================================================================= +# MetaVox Load Test - Cleanup Playbook +# ============================================================================= +# Verwijdert alle teamfolders, metadata-velden, metadata-records en bestanden +# die door de load test zijn aangemaakt. 
+# +# Gebruik: ansible-playbook -i inventory/hosts.yml cleanup.yml +# ============================================================================= + +- name: MetaVox Load Test - Opruimen + hosts: nextcloud_server + gather_facts: false + vars: + nc_api_base: "{{ nextcloud_url }}/ocs/v2.php" + nc_index_base: "{{ nextcloud_url }}/index.php" + nc_auth_header: "Basic {{ (nextcloud_admin_user + ':' + nextcloud_admin_password) | b64encode }}" + common_headers: + OCS-APIRequest: "true" + Accept: "application/json" + Authorization: "{{ nc_auth_header }}" + + tasks: + # ========================================================================= + # Stap 1: Verwijder metadata records uit database + # ========================================================================= + - name: "Cleanup: Verwijder file metadata records" + ansible.builtin.command: + cmd: mysql -u root {{ nextcloud_db_name }} -e "TRUNCATE TABLE oc_metavox_file_gf_meta;" + ignore_errors: true + + - name: "Cleanup: Verwijder groupfolder metadata records" + ansible.builtin.command: + cmd: mysql -u root {{ nextcloud_db_name }} -e "TRUNCATE TABLE oc_metavox_gf_metadata;" + ignore_errors: true + + - name: "Cleanup: Verwijder field assignments" + ansible.builtin.command: + cmd: mysql -u root {{ nextcloud_db_name }} -e "DELETE FROM oc_metavox_gf_assigns WHERE groupfolder_id IN (SELECT folder_id FROM oc_group_folders WHERE mount_point LIKE '{{ teamfolder_prefix }}-%');" + ignore_errors: true + + # ========================================================================= + # Stap 2: Verwijder teamfolders via API + # ========================================================================= + - name: "Cleanup: Haal lijst van bestaande teamfolders op" + ansible.builtin.uri: + url: "{{ nc_index_base }}/apps/groupfolders/folders" + method: GET + headers: "{{ common_headers }}" + return_content: true + timeout: "{{ http_timeout }}" + register: existing_folders + + - name: "Cleanup: Filter load-test teamfolders" + 
ansible.builtin.set_fact: + loadtest_folder_ids: >- + {{ + existing_folders.json.ocs.data | dict2items + | selectattr('value.mount_point', 'match', teamfolder_prefix ~ '-.*') + | map(attribute='key') + | list + }} + when: existing_folders.json.ocs.data is defined + + - name: "Cleanup: Verwijder {{ loadtest_folder_ids | default([]) | length }} teamfolders" + ansible.builtin.uri: + url: "{{ nc_index_base }}/apps/groupfolders/folders/{{ item }}" + method: DELETE + headers: "{{ common_headers }}" + status_code: [200, 404] + timeout: "{{ http_timeout }}" + loop: "{{ loadtest_folder_ids | default([]) }}" + loop_control: + label: "Folder ID {{ item }}" + pause: 0.2 + + # ========================================================================= + # Stap 3: Verwijder metadata velddefinities + # ========================================================================= + - name: "Cleanup: Verwijder metadata-velden via script" + ansible.builtin.template: + src: templates/cleanup_metadata.py.j2 + dest: /tmp/metavox_cleanup_metadata.py + mode: '0755' + + - name: "Cleanup: Draai metadata cleanup" + ansible.builtin.command: + cmd: python3 /tmp/metavox_cleanup_metadata.py + register: cleanup_result + changed_when: "'deleted' in cleanup_result.stdout" + + # ========================================================================= + # Stap 4: Verwijder groep en tijdelijke bestanden + # ========================================================================= + - name: "Cleanup: Verwijder loadtest groep" + ansible.builtin.uri: + url: "{{ nc_api_base }}/cloud/groups/{{ loadtest_group }}" + method: DELETE + headers: "{{ common_headers }}" + status_code: [200, 404] + timeout: "{{ http_timeout }}" + when: create_group | default(true) + + - name: "Cleanup: Verwijder tijdelijke scripts" + ansible.builtin.file: + path: "{{ item }}" + state: absent + loop: + - /tmp/metavox_setup_metadata_fields.py + - /tmp/metavox_fast_create_files.sh + - /tmp/metavox_fast_db_register.py + - 
/tmp/metavox_create_folder_structure.py + - /tmp/metavox_fast_metadata_insert.py + - /tmp/metavox_cleanup_metadata.py + - /tmp/filecache_batch.sql + - /tmp/dir_insert.sql + - /tmp/gf_metadata_insert.sql + - /tmp/file_meta_batch.sql + + - name: "Cleanup: Voltooid" + ansible.builtin.debug: + msg: | + Cleanup voltooid! + - {{ loadtest_folder_ids | default([]) | length }} teamfolders verwijderd + - Metadata records opgeruimd (TRUNCATE) + - Metadata-velden opgeruimd + - Groep '{{ loadtest_group }}' verwijderd diff --git a/group_vars/all.yml b/group_vars/all.yml new file mode 100644 index 0000000..fbca7ff --- /dev/null +++ b/group_vars/all.yml @@ -0,0 +1,192 @@ +# ============================================================================= +# Nextcloud Server Configuratie +# ============================================================================= +nextcloud_url: "https://seedmv.researchdrivede.src.surf-hosted.nl" +nextcloud_admin_user: "admin" +nextcloud_admin_password: "secureadminpass" +nextcloud_db_name: "nextcloud" +nextcloud_data_dir: "/var/www/nextcloud/data" +groupfolders_dir: "{{ nextcloud_data_dir }}/__groupfolders" + +# ============================================================================= +# Groep die toegang krijgt tot de teamfolders +# ============================================================================= +loadtest_group: "loadtest-group" +create_group: true + +# ============================================================================= +# Team Folders Configuratie +# ============================================================================= +num_teamfolders: 50 +teamfolder_prefix: "LoadTest-TF" +teamfolder_permissions: 31 + +# ============================================================================= +# Mappenstructuur per teamfolder +# ============================================================================= +# 10 hoofdmappen x 3 submappen = 30 leaf folders per teamfolder +folder_structure: + Financieel: ["Facturen", 
"Contracten", "Begrotingen"] + HR: ["Personeelsdossiers", "Sollicitaties", "Verlof"] + IT: ["Projecten", "Infrastructuur", "Security"] + Juridisch: ["Contracten", "Compliance", "Dossiers"] + Marketing: ["Campagnes", "Materiaal", "Analyses"] + Operations: ["Logistiek", "Inkoop", "Planning"] + Management: ["Notulen", "Strategie", "Rapportages"] + Onderzoek: ["Publicaties", "Data", "Experimenten"] + Communicatie: ["Intern", "Extern", "Persberichten"] + Archief: ["2023", "2024", "2025"] + +# ============================================================================= +# MetaVox Metadata Velden Configuratie +# ============================================================================= + +# 10 Teamfolder-velden (metadata die op teamfolder-niveau wordt ingesteld) +teamfolder_metadata_fields: + - { name: "tf_department", type: "text", description: "Afdeling" } + - { name: "tf_project_code", type: "text", description: "Projectcode" } + - { name: "tf_classification", type: "dropdown", description: "Classificatie", options: "Openbaar,Intern,Vertrouwelijk,Geheim" } + - { name: "tf_owner", type: "text", description: "Eigenaar" } + - { name: "tf_status", type: "dropdown", description: "Status", options: "Actief,Archief,Concept,Afgesloten" } + - { name: "tf_start_date", type: "date", description: "Startdatum" } + - { name: "tf_end_date", type: "date", description: "Einddatum" } + - { name: "tf_budget_code", type: "text", description: "Budgetcode" } + - { name: "tf_is_confidential", type: "checkbox", description: "Vertrouwelijk" } + - { name: "tf_location", type: "text", description: "Locatie" } + +# 90 File-metadata-velden (metadata die per bestand wordt ingesteld) +file_metadata_fields: + # Text velden (30 stuks) + - { name: "doc_title", type: "text", description: "Documenttitel" } + - { name: "doc_author", type: "text", description: "Auteur" } + - { name: "doc_subject", type: "text", description: "Onderwerp" } + - { name: "doc_keywords", type: "text", description: 
"Trefwoorden" } + - { name: "doc_source", type: "text", description: "Bron" } + - { name: "doc_language", type: "text", description: "Taal" } + - { name: "doc_version", type: "text", description: "Versie" } + - { name: "doc_reference", type: "text", description: "Referentie" } + - { name: "doc_creator", type: "text", description: "Maker" } + - { name: "doc_contributor", type: "text", description: "Bijdrager" } + - { name: "doc_publisher", type: "text", description: "Uitgever" } + - { name: "doc_rights", type: "text", description: "Rechten" } + - { name: "doc_identifier", type: "text", description: "Identifier" } + - { name: "doc_relation", type: "text", description: "Relatie" } + - { name: "doc_coverage", type: "text", description: "Dekking" } + - { name: "doc_abstract", type: "text", description: "Samenvatting" } + - { name: "doc_notes", type: "text", description: "Notities" } + - { name: "doc_contact", type: "text", description: "Contactpersoon" } + - { name: "doc_email", type: "text", description: "E-mailadres" } + - { name: "doc_phone", type: "text", description: "Telefoonnummer" } + - { name: "doc_address", type: "text", description: "Adres" } + - { name: "doc_city", type: "text", description: "Stad" } + - { name: "doc_country", type: "text", description: "Land" } + - { name: "doc_postal_code", type: "text", description: "Postcode" } + - { name: "doc_organization", type: "text", description: "Organisatie" } + - { name: "doc_unit", type: "text", description: "Afdeling" } + - { name: "doc_role", type: "text", description: "Rol" } + - { name: "doc_case_number", type: "text", description: "Zaaknummer" } + - { name: "doc_invoice_number", type: "text", description: "Factuurnummer" } + - { name: "doc_contract_number", type: "text", description: "Contractnummer" } + + # Dropdown velden (20 stuks) + - { name: "doc_type", type: "dropdown", description: "Documenttype", options: "Brief,Rapport,Notitie,Factuur,Contract,Offerte,Notulen,Beleidsstuk,Memo,Overig" } + - { name: 
"doc_status", type: "dropdown", description: "Documentstatus", options: "Concept,Review,Goedgekeurd,Definitief,Verlopen,Ingetrokken" } + - { name: "doc_priority", type: "dropdown", description: "Prioriteit", options: "Laag,Normaal,Hoog,Urgent,Kritiek" } + - { name: "doc_category", type: "dropdown", description: "Categorie", options: "Financieel,Juridisch,HR,IT,Marketing,Operations,R&D,Strategie,Compliance" } + - { name: "doc_sensitivity", type: "dropdown", description: "Gevoeligheid", options: "Openbaar,Intern,Vertrouwelijk,Strikt vertrouwelijk" } + - { name: "doc_retention", type: "dropdown", description: "Bewaartermijn", options: "1 jaar,3 jaar,5 jaar,7 jaar,10 jaar,Permanent" } + - { name: "doc_format", type: "dropdown", description: "Formaat", options: "PDF,Word,Excel,PowerPoint,Afbeelding,E-mail,Overig" } + - { name: "doc_review_status", type: "dropdown", description: "Reviewstatus", options: "Niet gereviewed,In review,Goedgekeurd,Afgekeurd" } + - { name: "doc_approval_level", type: "dropdown", description: "Goedkeuringsniveau", options: "Team,Management,Directie,Bestuur" } + - { name: "doc_lifecycle", type: "dropdown", description: "Levenscyclus", options: "Creatie,Gebruik,Archivering,Vernietiging" } + - { name: "doc_audience", type: "dropdown", description: "Doelgroep", options: "Intern,Extern,Bestuur,Partners,Klanten,Leveranciers" } + - { name: "doc_region", type: "dropdown", description: "Regio", options: "Noord,Oost,Zuid,West,Centraal,Internationaal" } + - { name: "doc_quarter", type: "dropdown", description: "Kwartaal", options: "Q1,Q2,Q3,Q4" } + - { name: "doc_fiscal_year", type: "dropdown", description: "Boekjaar", options: "2023,2024,2025,2026,2027" } + - { name: "doc_department", type: "dropdown", description: "Afdeling", options: "Finance,HR,IT,Legal,Marketing,Operations,R&D,Sales,Support" } + - { name: "doc_workflow_state", type: "dropdown", description: "Workflowstatus", options: "Nieuw,In behandeling,Wachtend,Afgerond,Geannuleerd" } + - { name: 
"doc_archive_reason", type: "dropdown", description: "Archiveringsreden", options: "Bewaartermijn,Afgesloten project,Wettelijke verplichting,Verzoek" } + - { name: "doc_access_level", type: "dropdown", description: "Toegangsniveau", options: "Iedereen,Team,Management,Beperkt" } + - { name: "doc_origin", type: "dropdown", description: "Herkomst", options: "Intern,Extern,Partner,Overheid,Klant" } + - { name: "doc_media_type", type: "dropdown", description: "Mediatype", options: "Tekst,Afbeelding,Audio,Video,Mixed" } + + # Date velden (20 stuks) + - { name: "doc_created_date", type: "date", description: "Aanmaakdatum" } + - { name: "doc_modified_date", type: "date", description: "Wijzigingsdatum" } + - { name: "doc_published_date", type: "date", description: "Publicatiedatum" } + - { name: "doc_expiry_date", type: "date", description: "Verloopdatum" } + - { name: "doc_review_date", type: "date", description: "Reviewdatum" } + - { name: "doc_approval_date", type: "date", description: "Goedkeuringsdatum" } + - { name: "doc_archive_date", type: "date", description: "Archiveringsdatum" } + - { name: "doc_effective_date", type: "date", description: "Ingangsdatum" } + - { name: "doc_received_date", type: "date", description: "Ontvangstdatum" } + - { name: "doc_sent_date", type: "date", description: "Verzenddatum" } + - { name: "doc_signed_date", type: "date", description: "Tekendatum" } + - { name: "doc_deadline", type: "date", description: "Deadline" } + - { name: "doc_meeting_date", type: "date", description: "Vergaderdatum" } + - { name: "doc_start_date", type: "date", description: "Startdatum" } + - { name: "doc_end_date", type: "date", description: "Einddatum" } + - { name: "doc_invoice_date", type: "date", description: "Factuurdatum" } + - { name: "doc_payment_date", type: "date", description: "Betaaldatum" } + - { name: "doc_due_date", type: "date", description: "Vervaldatum" } + - { name: "doc_birth_date", type: "date", description: "Geboortedatum" } + - { name: 
"doc_registration_date", type: "date", description: "Registratiedatum" } + + # Checkbox velden (20 stuks) + - { name: "doc_is_template", type: "checkbox", description: "Is template" } + - { name: "doc_is_signed", type: "checkbox", description: "Is getekend" } + - { name: "doc_is_approved", type: "checkbox", description: "Is goedgekeurd" } + - { name: "doc_is_archived", type: "checkbox", description: "Is gearchiveerd" } + - { name: "doc_is_public", type: "checkbox", description: "Is openbaar" } + - { name: "doc_is_confidential", type: "checkbox", description: "Is vertrouwelijk" } + - { name: "doc_is_final", type: "checkbox", description: "Is definitief" } + - { name: "doc_is_draft", type: "checkbox", description: "Is concept" } + - { name: "doc_needs_review", type: "checkbox", description: "Review nodig" } + - { name: "doc_needs_approval", type: "checkbox", description: "Goedkeuring nodig" } + - { name: "doc_has_attachments", type: "checkbox", description: "Heeft bijlagen" } + - { name: "doc_is_scanned", type: "checkbox", description: "Is gescand" } + - { name: "doc_is_ocr", type: "checkbox", description: "OCR verwerkt" } + - { name: "doc_is_encrypted", type: "checkbox", description: "Is versleuteld" } + - { name: "doc_is_compressed", type: "checkbox", description: "Is gecomprimeerd" } + - { name: "doc_is_original", type: "checkbox", description: "Is origineel" } + - { name: "doc_is_copy", type: "checkbox", description: "Is kopie" } + - { name: "doc_requires_action", type: "checkbox", description: "Actie vereist" } + - { name: "doc_is_billable", type: "checkbox", description: "Is factureerbaar" } + - { name: "doc_is_completed", type: "checkbox", description: "Is afgerond" } + +# ============================================================================= +# Dummy Bestanden Configuratie +# ============================================================================= +files_per_teamfolder: 10000 +dummy_file_content: "Dit is een dummy bestand voor MetaVox load 
testing. Aangemaakt door Ansible." +dummy_file_extension: "txt" +dummy_file_prefix: "loadtest-doc" + +# ============================================================================= +# Performance Configuratie +# ============================================================================= +sql_batch_size: 2000 +http_timeout: 300 +max_retries: 3 + +# ============================================================================= +# Database Index Optimalisatie +# ============================================================================= +# Bij grote hoeveelheden metadata records moeten overbodige indexes +# op oc_metavox_file_gf_meta gedropt worden voor performance en diskruimte. +# +# BEHOUDEN (4 indexes): +# - PRIMARY +# - mf_file_gf_meta_unique (file_id, groupfolder_id, field_name) +# - idx_file_gf_gf_lookup (groupfolder_id, field_name, field_value) +# - idx_gf_file_meta_filter (field_name, field_value, groupfolder_id) +# +# DROPPEN (7 redundante indexes): +drop_indexes: + - "idx_file_gf_composite" + - "mf_file_gf_meta_file" + - "mf_file_gf_meta_gf" + - "mf_file_gf_meta_field" + - "idx_gf_file_meta_file_id" + - "idx_gf_file_meta_timestamps" + - "idx_file_gf_updated" diff --git a/inventory/hosts.yml b/inventory/hosts.yml new file mode 100644 index 0000000..c4fd7e7 --- /dev/null +++ b/inventory/hosts.yml @@ -0,0 +1,6 @@ +--- +all: + hosts: + nextcloud_server: + ansible_connection: local + ansible_python_interpreter: /usr/bin/python3 diff --git a/site.yml b/site.yml new file mode 100644 index 0000000..7f49a18 --- /dev/null +++ b/site.yml @@ -0,0 +1,317 @@ +--- +# ============================================================================= +# MetaVox Load Test - Hoofd Playbook +# ============================================================================= +# Maakt teamfolders, metadata-velden, bestanden en metadata-records aan op een +# Nextcloud-instantie met MetaVox. +# +# Gebruikt directe filesystem writes en MySQL inserts i.p.v. 
WebDAV/API +# voor maximale snelheid (~500K bestanden + ~43M metadata records in < 2 uur). +# +# Gebruik: ansible-playbook -i inventory/hosts.yml site.yml +# Tags: precheck, setup, teamfolders, metadata-fields, files, db-register, +# folder-structure, drop-indexes, metadata-records, fix-ownership +# ============================================================================= + +- name: MetaVox Load Test - Setup en Data Generatie + hosts: nextcloud_server + gather_facts: true + vars: + nc_api_base: "{{ nextcloud_url }}/ocs/v2.php" + nc_dav_base: "{{ nextcloud_url }}/remote.php/dav" + nc_index_base: "{{ nextcloud_url }}/index.php" + nc_auth_header: "Basic {{ (nextcloud_admin_user + ':' + nextcloud_admin_password) | b64encode }}" + common_headers: + OCS-APIRequest: "true" + Accept: "application/json" + Authorization: "{{ nc_auth_header }}" + + tasks: + # ========================================================================= + # FASE 0: Pre-checks + # ========================================================================= + - name: "Pre-check: Controleer of Nextcloud bereikbaar is" + ansible.builtin.uri: + url: "{{ nextcloud_url }}/status.php" + method: GET + return_content: true + timeout: 30 + register: nc_status + failed_when: nc_status.status != 200 + tags: [precheck] + + - name: "Pre-check: Toon Nextcloud versie" + ansible.builtin.debug: + msg: "Nextcloud is bereikbaar. 
Versie: {{ (nc_status.content | from_json).versionstring }}" + tags: [precheck] + + - name: "Pre-check: Controleer beschikbare schijfruimte" + ansible.builtin.command: df -h /var/www/nextcloud/data + register: disk_check + changed_when: false + tags: [precheck] + + - name: "Pre-check: Toon schijfruimte" + ansible.builtin.debug: + msg: "{{ disk_check.stdout }}" + tags: [precheck] + + # ========================================================================= + # FASE 0.5: Maak groep aan (optioneel) + # ========================================================================= + - name: "Setup: Maak loadtest groep aan" + ansible.builtin.uri: + url: "{{ nc_api_base }}/cloud/groups" + method: POST + headers: "{{ common_headers }}" + body_format: form-urlencoded + body: + groupid: "{{ loadtest_group }}" + status_code: [200, 400] + timeout: "{{ http_timeout }}" + when: create_group | default(true) + tags: [setup, teamfolders] + + # ========================================================================= + # FASE 1: Teamfolders aanmaken via API + # ========================================================================= + - name: "Teamfolders: Genereer lijst van teamfolder namen" + ansible.builtin.set_fact: + teamfolder_names: "{{ teamfolder_names | default([]) + [teamfolder_prefix + '-%03d' | format(item)] }}" + loop: "{{ range(1, num_teamfolders + 1) | list }}" + tags: [teamfolders] + + - name: "Teamfolders: Maak {{ num_teamfolders }} teamfolders aan" + ansible.builtin.uri: + url: "{{ nc_index_base }}/apps/groupfolders/folders" + method: POST + headers: "{{ common_headers }}" + body_format: form-urlencoded + body: + mountpoint: "{{ item }}" + return_content: true + status_code: [200] + timeout: "{{ http_timeout }}" + loop: "{{ teamfolder_names }}" + register: teamfolder_results + loop_control: + label: "{{ item }}" + pause: 0.2 + tags: [teamfolders] + + - name: "Teamfolders: Verzamel folder IDs" + ansible.builtin.set_fact: + teamfolder_ids: "{{ 
teamfolder_results.results | map(attribute='json') | map(attribute='ocs') | map(attribute='data') | map(attribute='id') | list }}" + tags: [teamfolders] + when: teamfolder_results is defined + + - name: "Teamfolders: Ken groep '{{ loadtest_group }}' toe aan elke teamfolder" + ansible.builtin.uri: + url: "{{ nc_index_base }}/apps/groupfolders/folders/{{ item }}/groups" + method: POST + headers: "{{ common_headers }}" + body_format: form-urlencoded + body: + group: "{{ loadtest_group }}" + status_code: [200] + timeout: "{{ http_timeout }}" + loop: "{{ teamfolder_ids }}" + loop_control: + label: "Folder ID {{ item }}" + pause: 0.1 + tags: [teamfolders] + + - name: "Teamfolders: Stel permissies in voor de groep" + ansible.builtin.uri: + url: "{{ nc_index_base }}/apps/groupfolders/folders/{{ item }}/groups/{{ loadtest_group }}" + method: POST + headers: "{{ common_headers }}" + body_format: form-urlencoded + body: + permissions: "{{ teamfolder_permissions }}" + status_code: [200] + timeout: "{{ http_timeout }}" + loop: "{{ teamfolder_ids }}" + loop_control: + label: "Folder ID {{ item }}" + pause: 0.1 + tags: [teamfolders] + + - name: "Teamfolders: Resultaat" + ansible.builtin.debug: + msg: "{{ num_teamfolders }} teamfolders aangemaakt met IDs: {{ teamfolder_ids[:5] }}... 
(eerste 5 getoond)" + tags: [teamfolders] + + # ========================================================================= + # FASE 2: Metadata velden aanmaken (MetaVox OCS API) + # ========================================================================= + - name: "Metadata: Kopieer metadata-setup script" + ansible.builtin.template: + src: templates/setup_metadata_fields.py.j2 + dest: /tmp/metavox_setup_metadata_fields.py + mode: '0755' + tags: [metadata-fields] + + - name: "Metadata: Maak metadata velddefinities aan via MetaVox API" + ansible.builtin.command: + cmd: python3 /tmp/metavox_setup_metadata_fields.py + environment: + PYTHONUNBUFFERED: "1" + register: metadata_fields_result + changed_when: "'created' in metadata_fields_result.stdout" + tags: [metadata-fields] + + - name: "Metadata: Toon resultaat" + ansible.builtin.debug: + msg: "{{ metadata_fields_result.stdout_lines | default(['Geen output']) }}" + tags: [metadata-fields] + + # ========================================================================= + # FASE 3: Bestanden aanmaken (direct filesystem) + # ========================================================================= + - name: "Bestanden: Kopieer file-creatie script" + ansible.builtin.template: + src: templates/fast_create_files.sh.j2 + dest: /tmp/metavox_fast_create_files.sh + mode: '0755' + tags: [files] + + - name: "Bestanden: Maak {{ files_per_teamfolder * num_teamfolders }} bestanden aan op filesystem" + ansible.builtin.command: + cmd: bash /tmp/metavox_fast_create_files.sh + register: create_files_result + changed_when: true + async: 7200 + poll: 30 + tags: [files] + + - name: "Bestanden: Toon resultaat" + ansible.builtin.debug: + msg: "{{ create_files_result.stdout_lines[-10:] | default(['Geen output']) }}" + tags: [files] + + # ========================================================================= + # FASE 4: Bestanden registreren in database (direct MySQL) + # 
========================================================================= + - name: "DB Register: Kopieer database registratie script" + ansible.builtin.template: + src: templates/fast_db_register.py.j2 + dest: /tmp/metavox_fast_db_register.py + mode: '0755' + tags: [db-register] + + - name: "DB Register: Registreer bestanden in oc_filecache" + ansible.builtin.command: + cmd: python3 /tmp/metavox_fast_db_register.py + environment: + PYTHONUNBUFFERED: "1" + register: db_register_result + changed_when: true + async: 7200 + poll: 30 + tags: [db-register] + + - name: "DB Register: Toon resultaat" + ansible.builtin.debug: + msg: "{{ db_register_result.stdout_lines[-10:] | default(['Geen output']) }}" + tags: [db-register] + + # ========================================================================= + # FASE 5: Mappenstructuur aanmaken en bestanden verplaatsen + # ========================================================================= + - name: "Mappen: Kopieer mappenstructuur script" + ansible.builtin.template: + src: templates/create_folder_structure.py.j2 + dest: /tmp/metavox_create_folder_structure.py + mode: '0755' + tags: [folder-structure] + + - name: "Mappen: Maak mappenstructuur aan en verplaats bestanden" + ansible.builtin.command: + cmd: python3 /tmp/metavox_create_folder_structure.py + environment: + PYTHONUNBUFFERED: "1" + register: folder_structure_result + changed_when: true + async: 7200 + poll: 30 + tags: [folder-structure] + + - name: "Mappen: Toon resultaat" + ansible.builtin.debug: + msg: "{{ folder_structure_result.stdout_lines[-10:] | default(['Geen output']) }}" + tags: [folder-structure] + + # ========================================================================= + # FASE 6: Drop overbodige indexes voor performance + # ========================================================================= + - name: "Indexes: Drop overbodige indexes op oc_metavox_file_gf_meta" + ansible.builtin.command: + cmd: > + mysql -u root {{ nextcloud_db_name }} -e 
+ "DROP INDEX IF EXISTS {{ item }} ON oc_metavox_file_gf_meta;" + loop: "{{ drop_indexes }}" + loop_control: + label: "DROP INDEX {{ item }}" + ignore_errors: true + tags: [drop-indexes] + + # ========================================================================= + # FASE 7: Metadata records invoegen (direct MySQL) + # ========================================================================= + - name: "Metadata Records: Kopieer metadata insert script" + ansible.builtin.template: + src: templates/fast_metadata_insert.py.j2 + dest: /tmp/metavox_fast_metadata_insert.py + mode: '0755' + tags: [metadata-records] + + - name: "Metadata Records: Voeg metadata records in via MySQL" + ansible.builtin.command: + cmd: python3 /tmp/metavox_fast_metadata_insert.py + environment: + PYTHONUNBUFFERED: "1" + register: metadata_insert_result + changed_when: true + async: 86400 + poll: 60 + tags: [metadata-records] + + - name: "Metadata Records: Toon resultaat" + ansible.builtin.debug: + msg: "{{ metadata_insert_result.stdout_lines[-10:] | default(['Geen output']) }}" + tags: [metadata-records] + + # ========================================================================= + # FASE 8: Fix ownership + # ========================================================================= + - name: "Fix: Zet eigenaar op www-data voor groupfolders" + ansible.builtin.file: + path: "{{ groupfolders_dir }}" + owner: www-data + group: www-data + recurse: true + tags: [fix-ownership] + + # ========================================================================= + # SAMENVATTING + # ========================================================================= + - name: "Samenvatting" + ansible.builtin.debug: + msg: | + ============================================================ + MetaVox Load Test - Voltooid! 
+ ============================================================ + Teamfolders aangemaakt: {{ num_teamfolders }} + Teamfolder metadata-velden: {{ teamfolder_metadata_fields | length }} + File metadata-velden: {{ file_metadata_fields | length }} + Totaal metadata-velden: {{ teamfolder_metadata_fields | length + file_metadata_fields | length }} + Bestanden per teamfolder: {{ files_per_teamfolder }} + Totaal bestanden: {{ files_per_teamfolder * num_teamfolders }} + Mappenstructuur: 10 hoofdmappen x 3 submappen + Metadata records: ~{{ files_per_teamfolder * num_teamfolders * (file_metadata_fields | length) }} + Gedropte indexes: {{ drop_indexes | length }} + ============================================================ + Methode: Direct filesystem + MySQL (geen WebDAV/API) + ============================================================ + tags: [always] diff --git a/templates/cleanup_metadata.py.j2 b/templates/cleanup_metadata.py.j2 new file mode 100644 index 0000000..fa4eaaf --- /dev/null +++ b/templates/cleanup_metadata.py.j2 @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +MetaVox Metadata Cleanup Script +Verwijdert metadata-velddefinities via de OCS API. 
+"""
+
+import json
+import sys
+import time
+import requests
+from requests.auth import HTTPBasicAuth
+
+# Connection settings; the placeholders are filled in when Ansible renders
+# this template.
+NC_URL = "{{ nextcloud_url }}"
+NC_USER = "{{ nextcloud_admin_user }}"
+NC_PASS = "{{ nextcloud_admin_password }}"
+TIMEOUT = {{ http_timeout }}
+
+# Fields are listed through the OCS endpoint and deleted through the
+# index.php endpoint.
+WEB_API_BASE = f"{NC_URL}/index.php/apps/metavox/api"
+OCS_BASE = f"{NC_URL}/ocs/v2.php/apps/metavox/api/v1"
+
+AUTH = HTTPBasicAuth(NC_USER, NC_PASS)
+HEADERS = {
+    "OCS-APIRequest": "true",
+    "Accept": "application/json",
+}
+
+TF_FIELDS = json.loads("""{{ teamfolder_metadata_fields | to_json }}""")
+FILE_FIELDS = json.loads("""{{ file_metadata_fields | to_json }}""")
+
+# Only fields whose name appears in the load-test configuration are deleted;
+# every other field on the server is left alone.
+FIELD_NAMES_TO_DELETE = set(f["name"] for f in TF_FIELDS + FILE_FIELDS)
+
+
+def get_all_fields():
+    """Fetch all groupfolder field definitions from the MetaVox OCS API.
+
+    Returns:
+        The value of ``ocs.data`` from the JSON response, or an empty list
+        when the request fails, the status is not 200, the body is not JSON,
+        or the payload has no ``ocs.data``. Every failure path prints an
+        ``[ERROR]`` line so a broken API is not mistaken for "no fields".
+    """
+    url = f"{OCS_BASE}/groupfolder-fields"
+    try:
+        resp = requests.get(url, auth=AUTH, headers=HEADERS, timeout=TIMEOUT)
+    except requests.RequestException as e:
+        print(f"[ERROR] Ophalen velden: {e}")
+        return []
+
+    if resp.status_code != 200:
+        print(f"[ERROR] Ophalen velden: HTTP {resp.status_code}")
+        return []
+
+    try:
+        data = resp.json()
+    except ValueError as e:
+        # requests raises a ValueError subclass when the body is not JSON.
+        print(f"[ERROR] Ophalen velden: {e}")
+        return []
+
+    ocs = data.get("ocs") if isinstance(data, dict) else None
+    if isinstance(ocs, dict) and "data" in ocs:
+        return ocs["data"]
+
+    print("[ERROR] Ophalen velden: onverwacht antwoord (geen ocs.data)")
+    return []
+
+
+def delete_field(field_id):
+    """Delete one field definition by id.
+
+    Args:
+        field_id: Identifier of the field, as returned by get_all_fields().
+
+    Returns:
+        True when the server answers 200, 204 or 404 (a 404 means the field
+        is already gone); False on any other status or on a transport error.
+    """
+    url = f"{WEB_API_BASE}/groupfolder-fields/{field_id}"
+    try:
+        resp = requests.delete(url, auth=AUTH, headers=HEADERS, timeout=TIMEOUT)
+    except requests.RequestException as e:
+        print(f"  [ERROR] Verwijderen veld {field_id}: {e}")
+        return False
+    return resp.status_code in [200, 204, 404]
+
+
+def main():
+    """Delete every load-test field definition and print a summary.
+
+    The cleanup playbook marks its task as changed when the word "deleted"
+    occurs in stdout, so that word is printed only when at least one field
+    was actually removed. The exit code stays 0 even when deletions fail:
+    cleanup is best-effort and later steps should still run.
+    """
+    print("MetaVox Metadata Cleanup")
+    print("=" * 40)
+
+    fields = get_all_fields()
+    print(f"Gevonden: {len(fields)} velden totaal")
+
+    deleted = 0
+    skipped = 0
+    failed = 0
+    for field in fields:
+        fname = field.get("field_name", "")
+        fid = field.get("id")
+        if fname not in FIELD_NAMES_TO_DELETE or not fid:
+            skipped += 1
+            continue
+        if delete_field(fid):
+            deleted += 1
+        else:
+            failed += 1
+            print(f"  [FAIL] Kon veld '{fname}' (id={fid}) niet verwijderen")
+
+    if deleted:
+        print(f"\n{deleted} velden deleted, {skipped} overgeslagen")
+    else:
+        print(f"\nGeen velden verwijderd, {skipped} overgeslagen")
+    if failed:
+        print(f"{failed} velden konden niet verwijderd worden")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/templates/create_folder_structure.py.j2
b/templates/create_folder_structure.py.j2 new file mode 100644 index 0000000..8781995 --- /dev/null +++ b/templates/create_folder_structure.py.j2 @@ -0,0 +1,257 @@ +#!/usr/bin/env python3 +""" +Create Folder Structure - Mappenstructuur aanmaken en bestanden verplaatsen +=========================================================================== +Maakt een 3-niveau mappenstructuur aan binnen elke teamfolder en verplaatst +de bestanden gelijkmatig over de leaf folders. + +10 hoofdmappen x 3 submappen = 30 leaf folders per teamfolder. +Bestanden worden verplaatst met os.rename (geen extra diskruimte nodig). +Database (oc_filecache) wordt bijgewerkt met nieuwe paden en parent IDs. +""" +import os +import subprocess +import hashlib +import time + +DB_NAME = "{{ nextcloud_db_name }}" +GROUPFOLDERS_DIR = "{{ groupfolders_dir }}" +NUM_TEAMFOLDERS = {{ num_teamfolders }} +FILES_PER_FOLDER = {{ files_per_teamfolder }} + +FOLDER_STRUCTURE = {{ folder_structure | to_json }} + +# Build flat list of leaf paths +LEAF_FOLDERS = [] +for main, subs in FOLDER_STRUCTURE.items(): + for sub in subs: + LEAF_FOLDERS.append(f"{main}/{sub}") + +print(f"Folder structuur: {len(FOLDER_STRUCTURE)} hoofdmappen, {len(LEAF_FOLDERS)} submappen") + + +def mysql_exec(sql): + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "-N", "-e", sql], + capture_output=True, text=True, timeout=120 + ) + if result.returncode != 0 and result.stderr.strip(): + print(f" [SQL ERROR] {result.stderr[:200]}") + return result.stdout.strip() + + +def mysql_exec_file(filepath): + with open(filepath) as f: + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"], + stdin=f, capture_output=True, text=True, timeout=300 + ) + if result.returncode != 0: + print(f" [SQL ERROR] {result.stderr[:200]}") + return result + + +def escape_sql(s): + return s.replace("\\", "\\\\").replace("'", "\\'") + + +def main(): + start_time = time.time() + now = int(time.time()) + + # Get storage mapping 
+ print("Ophalen storage mapping...") + rows = mysql_exec(""" + SELECT s.numeric_id, + REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num + FROM oc_storages s + WHERE s.id LIKE 'local::%/__groupfolders/%' + """) + storage_map = {} + for line in rows.split('\n'): + if line.strip(): + parts = line.split('\t') + storage_map[int(parts[1])] = int(parts[0]) + + # Get groupfolder mapping + rows = mysql_exec(""" + SELECT folder_id, mount_point FROM oc_group_folders + WHERE mount_point LIKE '{{ teamfolder_prefix }}-%' + ORDER BY folder_id + """) + gf_map = {} + for line in rows.split('\n'): + if line.strip(): + parts = line.split('\t') + gf_id = int(parts[0]) + tf_idx = int(parts[1].split('-')[-1]) + if tf_idx <= NUM_TEAMFOLDERS: + gf_map[tf_idx] = gf_id + + print(f"{len(gf_map)} teamfolders gevonden") + + # Get mimetype ID for directories + dir_mime = int(mysql_exec("SELECT id FROM oc_mimetypes WHERE mimetype='httpd/unix-directory'")) + + total_moved = 0 + + for tf_idx in sorted(gf_map.keys()): + gf_id = gf_map[tf_idx] + storage_id = storage_map.get(gf_id) + if not storage_id: + print(f" [SKIP] Geen storage voor gf_id {gf_id}") + continue + + folder_num = gf_id + files_dir = f"{GROUPFOLDERS_DIR}/{folder_num}/files" + + # Check if already restructured + first_sub = list(FOLDER_STRUCTURE.keys())[0] + if os.path.isdir(f"{files_dir}/{first_sub}"): + print(f" [SKIP] Folder {tf_idx} (gf_id={gf_id}): al gestructureerd") + total_moved += FILES_PER_FOLDER + continue + + # Get parent_id for 'files' directory + files_parent = mysql_exec(f"SELECT fileid FROM oc_filecache WHERE storage = {storage_id} AND path = 'files'") + if not files_parent: + print(f" [SKIP] Geen files entry voor storage {storage_id}") + continue + files_parent_id = int(files_parent) + + # Step 1: Create directories on disk + for main_folder, subs in FOLDER_STRUCTURE.items(): + os.makedirs(f"{files_dir}/{main_folder}", exist_ok=True) + for sub in subs: + 
os.makedirs(f"{files_dir}/{main_folder}/{sub}", exist_ok=True) + + # Step 2: Insert main directory entries into oc_filecache + dir_values = [] + for main_folder in FOLDER_STRUCTURE.keys(): + path = f"files/{main_folder}" + path_hash = hashlib.md5(path.encode()).hexdigest() + etag = hashlib.md5(f"{now}{path}".encode()).hexdigest() + dir_values.append( + f"({storage_id}, '{escape_sql(path)}', '{path_hash}', {files_parent_id}, " + f"'{escape_sql(main_folder)}', {dir_mime}, {dir_mime}, 0, {now}, {now}, '{etag}', 31, 0)" + ) + + sql_file = "/tmp/dir_insert.sql" + with open(sql_file, 'w') as f: + f.write("INSERT IGNORE INTO oc_filecache (storage, path, path_hash, parent, name, mimetype, mimepart, size, mtime, storage_mtime, etag, permissions, unencrypted_size) VALUES\n") + f.write(",\n".join(dir_values)) + f.write(";\n") + mysql_exec_file(sql_file) + + # Get main folder IDs + main_ids = {} + for main_folder in FOLDER_STRUCTURE.keys(): + path = f"files/{main_folder}" + fid = mysql_exec(f"SELECT fileid FROM oc_filecache WHERE storage = {storage_id} AND path = '{escape_sql(path)}'") + if fid: + main_ids[main_folder] = int(fid) + + # Insert sub folder entries + sub_values = [] + for main_folder, subs in FOLDER_STRUCTURE.items(): + parent_id = main_ids.get(main_folder, files_parent_id) + for sub in subs: + path = f"files/{main_folder}/{sub}" + path_hash = hashlib.md5(path.encode()).hexdigest() + etag = hashlib.md5(f"{now}{path}".encode()).hexdigest() + sub_values.append( + f"({storage_id}, '{escape_sql(path)}', '{path_hash}', {parent_id}, " + f"'{escape_sql(sub)}', {dir_mime}, {dir_mime}, 0, {now}, {now}, '{etag}', 31, 0)" + ) + + with open(sql_file, 'w') as f: + f.write("INSERT IGNORE INTO oc_filecache (storage, path, path_hash, parent, name, mimetype, mimepart, size, mtime, storage_mtime, etag, permissions, unencrypted_size) VALUES\n") + f.write(",\n".join(sub_values)) + f.write(";\n") + mysql_exec_file(sql_file) + + # Get sub folder IDs + sub_ids = {} + for main_folder, 
subs in FOLDER_STRUCTURE.items(): + for sub in subs: + path = f"files/{main_folder}/{sub}" + fid = mysql_exec(f"SELECT fileid FROM oc_filecache WHERE storage = {storage_id} AND path = '{escape_sql(path)}'") + if fid: + sub_ids[f"{main_folder}/{sub}"] = int(fid) + + # Step 3: Move files on disk and update DB + file_rows = mysql_exec(f""" + SELECT fileid, name FROM oc_filecache + WHERE storage = {storage_id} AND path LIKE 'files/{{ dummy_file_prefix }}%' + ORDER BY fileid + """) + files = [] + for line in file_rows.split('\n'): + if line.strip(): + parts = line.split('\t') + files.append((int(parts[0]), parts[1])) + + if not files: + print(f" [SKIP] Folder {tf_idx}: geen bestanden") + continue + + # Distribute files across leaf folders + files_per_leaf = len(files) // len(LEAF_FOLDERS) + remainder = len(files) % len(LEAF_FOLDERS) + + file_idx = 0 + for leaf_idx, leaf_path in enumerate(LEAF_FOLDERS): + leaf_parent_id = sub_ids.get(leaf_path, files_parent_id) + count = files_per_leaf + (1 if leaf_idx < remainder else 0) + + for _ in range(count): + if file_idx >= len(files): + break + fid, fname = files[file_idx] + old_path = f"{files_dir}/{fname}" + new_path_disk = f"{files_dir}/{leaf_path}/{fname}" + + if os.path.exists(old_path): + os.rename(old_path, new_path_disk) + file_idx += 1 + + # Batch update DB + BATCH = 2000 + for batch_start in range(0, len(files), BATCH): + updates = [] + for leaf_idx, leaf_path in enumerate(LEAF_FOLDERS): + leaf_parent_id = sub_ids.get(leaf_path, files_parent_id) + count = files_per_leaf + (1 if leaf_idx < remainder else 0) + leaf_start = sum(files_per_leaf + (1 if i < remainder else 0) for i in range(leaf_idx)) + leaf_end = leaf_start + count + + for i in range(max(leaf_start, batch_start), min(leaf_end, batch_start + BATCH)): + if i >= len(files): + break + fid, fname = files[i] + new_path_db = f"files/{leaf_path}/{fname}" + new_path_hash = hashlib.md5(new_path_db.encode()).hexdigest() + updates.append(f"UPDATE oc_filecache SET 
path='{escape_sql(new_path_db)}', path_hash='{new_path_hash}', parent={leaf_parent_id} WHERE fileid={fid};") + + if updates: + with open(sql_file, 'w') as f: + f.write("\n".join(updates)) + mysql_exec_file(sql_file) + + # Fix ownership + os.system(f"chown -R www-data:www-data {files_dir}") + + total_moved += len(files) + elapsed = time.time() - start_time + print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {len(files)} bestanden verplaatst naar {len(LEAF_FOLDERS)} submappen ({elapsed:.0f}s, totaal: {total_moved})") + + elapsed = time.time() - start_time + print(f"\n{'='*60}") + print(f"Voltooid in {elapsed:.0f} seconden") + print(f"Totaal verplaatst: {total_moved}") + print(f"Structuur: {len(FOLDER_STRUCTURE)} hoofdmappen, {len(LEAF_FOLDERS)} submappen per teamfolder") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/templates/fast_create_files.sh.j2 b/templates/fast_create_files.sh.j2 new file mode 100644 index 0000000..7bdf9fb --- /dev/null +++ b/templates/fast_create_files.sh.j2 @@ -0,0 +1,73 @@ +#!/bin/bash +# ============================================================================= +# Fast File Creator - Direct filesystem +# ============================================================================= +# Maakt {{ files_per_teamfolder }} bestanden per teamfolder aan op het filesystem. +# Veel sneller dan WebDAV uploads (~500K bestanden in ~40 minuten). 
set -euo pipefail

# Values below are filled in by the Ansible template engine.
GROUPFOLDERS_DIR="{{ groupfolders_dir }}"
NUM_TEAMFOLDERS={{ num_teamfolders }}
FILES_PER_FOLDER={{ files_per_teamfolder }}
FILE_PREFIX="{{ dummy_file_prefix }}"
FILE_EXT="{{ dummy_file_extension }}"
FILE_CONTENT="{{ dummy_file_content }}"

echo "============================================================"
echo "Fast File Creator - Direct Filesystem"
echo "Doel: ${NUM_TEAMFOLDERS} folders x ${FILES_PER_FOLDER} bestanden"
echo "============================================================"

# Fetch the ids of the load-test groupfolders; the id doubles as the
# directory name below GROUPFOLDERS_DIR.
GF_IDS=$(mysql -u root "{{ nextcloud_db_name }}" -N -e \
  "SELECT folder_id FROM oc_group_folders
   WHERE mount_point LIKE '{{ teamfolder_prefix }}-%'
   ORDER BY folder_id
   LIMIT ${NUM_TEAMFOLDERS}")

TOTAL_CREATED=0
FOLDER_COUNT=0
START_TIME=$(date +%s)

for GF_ID in ${GF_IDS}; do
  FOLDER_COUNT=$((FOLDER_COUNT + 1))
  FILES_DIR="${GROUPFOLDERS_DIR}/${GF_ID}/files"

  # Create the files directory when it does not exist yet
  mkdir -p "${FILES_DIR}"

  # Count existing files at ANY depth: the folder-structure step moves the
  # files into subfolders, and a depth-1 count would then report zero and
  # make a rerun recreate every file in the root.
  EXISTING=$(find "${FILES_DIR}" -type f -name "${FILE_PREFIX}-*.${FILE_EXT}" 2>/dev/null | wc -l)
  if [ "${EXISTING}" -ge "${FILES_PER_FOLDER}" ]; then
    echo "[SKIP] Folder ${FOLDER_COUNT} (gf_id=${GF_ID}): ${EXISTING} bestanden bestaan al"
    TOTAL_CREATED=$((TOTAL_CREATED + EXISTING))
    continue
  fi

  # Create the files; seq -w zero-pads every number to the width of the
  # largest one.
  CREATED=0
  for i in $(seq -w 1 "${FILES_PER_FOLDER}"); do
    FNAME="${FILE_PREFIX}-${i}.${FILE_EXT}"
    FPATH="${FILES_DIR}/${FNAME}"
    if [ ! -f "${FPATH}" ]; then
      # printf instead of echo: content starting with "-n"/"-e" or holding
      # backslashes is written verbatim.
      printf '%s\n' "${FILE_CONTENT}" > "${FPATH}"
      CREATED=$((CREATED + 1))
    fi
  done

  # Fix ownership
  chown -R www-data:www-data "${FILES_DIR}"

  TOTAL_CREATED=$((TOTAL_CREATED + FILES_PER_FOLDER))
  ELAPSED=$(( $(date +%s) - START_TIME ))
  echo "[DONE] Folder ${FOLDER_COUNT} (gf_id=${GF_ID}): ${CREATED} nieuw aangemaakt (${ELAPSED}s, totaal: ${TOTAL_CREATED})"
done

ELAPSED=$(( $(date +%s) - START_TIME ))
echo ""
echo "============================================================"
echo "Voltooid in ${ELAPSED} seconden"
echo "Totaal bestanden: ${TOTAL_CREATED}"
echo "============================================================"
+""" +import os +import subprocess +import hashlib +import time + +DB_NAME = "{{ nextcloud_db_name }}" +GROUPFOLDERS_DIR = "{{ groupfolders_dir }}" +NUM_TEAMFOLDERS = {{ num_teamfolders }} +FILES_PER_FOLDER = {{ files_per_teamfolder }} +FILE_PREFIX = "{{ dummy_file_prefix }}" +FILE_EXT = "{{ dummy_file_extension }}" +SQL_BATCH_SIZE = {{ sql_batch_size }} + + +def mysql_exec(sql): + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "-N", "-e", sql], + capture_output=True, text=True, timeout=120 + ) + if result.returncode != 0 and result.stderr.strip(): + print(f" [SQL ERROR] {result.stderr[:200]}") + return result.stdout.strip() + + +def mysql_exec_file(filepath): + with open(filepath) as f: + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"], + stdin=f, capture_output=True, text=True, timeout=300 + ) + if result.returncode != 0: + print(f" [SQL ERROR] {result.stderr[:200]}") + return result + + +def escape_sql(s): + return s.replace("\\", "\\\\").replace("'", "\\'") + + +def main(): + start_time = time.time() + now = int(time.time()) + + print("=" * 60) + print("Fast DB Register - oc_filecache insert") + print(f"Doel: {NUM_TEAMFOLDERS} folders x {FILES_PER_FOLDER} bestanden") + print("=" * 60) + + # Get storage mapping (gf_id -> storage numeric_id) + print("\nStap 1: Ophalen storage mapping...") + rows = mysql_exec(""" + SELECT s.numeric_id, + REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num + FROM oc_storages s + WHERE s.id LIKE 'local::%/__groupfolders/%' + """) + storage_map = {} + for line in rows.split('\n'): + if line.strip(): + parts = line.split('\t') + storage_map[int(parts[1])] = int(parts[0]) + + # Get groupfolder mapping + print("Stap 2: Ophalen groupfolder mapping...") + rows = mysql_exec(""" + SELECT folder_id, mount_point FROM oc_group_folders + WHERE mount_point LIKE '{{ teamfolder_prefix }}-%' + ORDER BY folder_id + """) + gf_map = {} + for line in 
rows.split('\n'): + if line.strip(): + parts = line.split('\t') + gf_id = int(parts[0]) + tf_idx = int(parts[1].split('-')[-1]) + if tf_idx <= NUM_TEAMFOLDERS: + gf_map[tf_idx] = gf_id + + print(f" {len(gf_map)} teamfolders gevonden") + + # Get mimetype IDs + txt_mime = int(mysql_exec("SELECT id FROM oc_mimetypes WHERE mimetype='text/plain'")) + txt_part = int(mysql_exec("SELECT id FROM oc_mimetypes WHERE mimetype='text'")) + + total_inserted = 0 + + for tf_idx in sorted(gf_map.keys()): + gf_id = gf_map[tf_idx] + storage_id = storage_map.get(gf_id) + if not storage_id: + print(f" [SKIP] Geen storage voor gf_id {gf_id}") + continue + + # Check if files already registered + existing = mysql_exec(f""" + SELECT COUNT(*) FROM oc_filecache + WHERE storage = {storage_id} AND path LIKE 'files/{FILE_PREFIX}%' + """) + if existing and int(existing) >= FILES_PER_FOLDER: + print(f" [SKIP] Folder {tf_idx} (gf_id={gf_id}): {existing} bestanden al geregistreerd") + total_inserted += int(existing) + continue + + # Get parent fileid for 'files' directory + files_parent = mysql_exec(f"SELECT fileid FROM oc_filecache WHERE storage = {storage_id} AND path = 'files'") + if not files_parent: + print(f" [SKIP] Geen files entry voor storage {storage_id}") + continue + parent_id = int(files_parent) + + # Get file size from disk + sample_file = f"{GROUPFOLDERS_DIR}/{gf_id}/files/{FILE_PREFIX}-00001.{FILE_EXT}" + try: + file_size = os.path.getsize(sample_file) + except OSError: + file_size = 80 # fallback + + # Generate INSERT statements in batches + values = [] + for i in range(1, FILES_PER_FOLDER + 1): + fname = f"{FILE_PREFIX}-{i:05d}.{FILE_EXT}" + path = f"files/{fname}" + path_hash = hashlib.md5(path.encode()).hexdigest() + etag = hashlib.md5(f"{now}{path}".encode()).hexdigest() + + values.append( + f"({storage_id}, '{escape_sql(path)}', '{path_hash}', {parent_id}, " + f"'{escape_sql(fname)}', {txt_mime}, {txt_part}, {file_size}, " + f"{now}, {now}, '{etag}', 27, 0)" + ) + + if 
len(values) >= SQL_BATCH_SIZE: + sql_file = "/tmp/filecache_batch.sql" + with open(sql_file, 'w') as f: + f.write("INSERT IGNORE INTO oc_filecache " + "(storage, path, path_hash, parent, name, mimetype, mimepart, " + "size, mtime, storage_mtime, etag, permissions, unencrypted_size) VALUES\n") + f.write(",\n".join(values)) + f.write(";\n") + mysql_exec_file(sql_file) + values = [] + + # Flush remaining + if values: + sql_file = "/tmp/filecache_batch.sql" + with open(sql_file, 'w') as f: + f.write("INSERT IGNORE INTO oc_filecache " + "(storage, path, path_hash, parent, name, mimetype, mimepart, " + "size, mtime, storage_mtime, etag, permissions, unencrypted_size) VALUES\n") + f.write(",\n".join(values)) + f.write(";\n") + mysql_exec_file(sql_file) + + total_inserted += FILES_PER_FOLDER + elapsed = time.time() - start_time + rate = total_inserted / elapsed if elapsed > 0 else 0 + print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {FILES_PER_FOLDER} records ({elapsed:.0f}s, totaal: {total_inserted}, {rate:.0f}/s)") + + elapsed = time.time() - start_time + print(f"\n{'='*60}") + print(f"Voltooid in {elapsed:.0f} seconden") + print(f"Totaal geregistreerd: {total_inserted}") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/templates/fast_metadata_insert.py.j2 b/templates/fast_metadata_insert.py.j2 new file mode 100644 index 0000000..21eb6e6 --- /dev/null +++ b/templates/fast_metadata_insert.py.j2 @@ -0,0 +1,247 @@ +#!/usr/bin/env python3 +""" +Fast Metadata Insert - Direct MySQL +===================================== +Voegt metadata records in voor alle bestanden via directe MySQL inserts. +Veel sneller dan de MetaVox API (~43M records in ~2 uur). 
+ +Tabellen: + - oc_metavox_file_gf_meta: per-bestand metadata (file_id, groupfolder_id, field_name, field_value) + - oc_metavox_gf_metadata: per-teamfolder metadata (groupfolder_id, field_name, field_value) +""" +import subprocess +import time +import random +import string +from datetime import datetime, timedelta + +DB_NAME = "{{ nextcloud_db_name }}" +NUM_TEAMFOLDERS = {{ num_teamfolders }} +FILES_PER_FOLDER = {{ files_per_teamfolder }} +SQL_BATCH_SIZE = {{ sql_batch_size }} + +NAMES = ["Jan", "Piet", "Klaas", "Marie", "Anna", "Sophie", "Thomas", "Eva", + "Lucas", "Emma", "Liam", "Olivia", "Noah", "Mia", "Daan", "Sara"] +ORGS = ["UvA", "HvA", "VU", "TU Delft", "Gemeente Amsterdam", "Ministerie BZK", + "Rijkswaterstaat", "UWV", "DUO", "KNAW"] +CITIES = ["Amsterdam", "Rotterdam", "Den Haag", "Utrecht", "Eindhoven", + "Groningen", "Tilburg", "Almere", "Breda", "Nijmegen"] +WORDS = ["beleid", "rapport", "analyse", "voorstel", "evaluatie", "plan", + "nota", "brief", "contract", "factuur", "verslag", "advies"] + +FILE_FIELDS = [ +{% for field in file_metadata_fields %} + ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"), +{% endfor %} +] + +TF_FIELDS = [ +{% for field in teamfolder_metadata_fields %} + ("{{ field.name }}", "{{ field.type }}{% if field.options is defined %}:{{ field.options }}{% endif %}"), +{% endfor %} +] + + +def random_date(): + start = datetime(2020, 1, 1) + delta = (datetime(2026, 12, 31) - start).days + return (start + timedelta(days=random.randint(0, delta))).strftime("%Y-%m-%d") + + +def gen_value(fname, ftype): + if ftype.startswith("dropdown:"): + options = ftype.split(":", 1)[1].split(",") + return random.choice(options) + elif ftype == "date": + return random_date() + elif ftype == "checkbox": + return random.choice(["true", "false"]) + else: # text + if "email" in fname: + return f"{random.choice(NAMES).lower()}@example.nl" + elif "phone" in fname: + return f"+31 6 
{random.randint(10000000, 99999999)}" + elif "postal" in fname: + return f"{random.randint(1000, 9999)} {''.join(random.choices(string.ascii_uppercase, k=2))}" + elif "city" in fname: + return random.choice(CITIES) + elif "country" in fname: + return "Nederland" + elif "organization" in fname: + return random.choice(ORGS) + elif any(w in fname for w in ["author", "creator", "contact", "owner"]): + return f"{random.choice(NAMES)} {random.choice(['de Vries', 'Jansen', 'Bakker', 'Visser'])}" + elif any(w in fname for w in ["number", "code", "identifier", "reference"]): + return f"{fname[:3].upper()}-{random.randint(10000, 99999)}" + elif "version" in fname: + return f"{random.randint(1, 10)}.{random.randint(0, 9)}" + elif "language" in fname: + return random.choice(["Nederlands", "Engels", "Duits"]) + else: + return " ".join(random.choices(WORDS, k=random.randint(1, 3))).capitalize() + + +def mysql_exec(sql): + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "-N", "-e", sql], + capture_output=True, text=True, timeout=120 + ) + if result.returncode != 0: + print(f" [SQL ERROR] {result.stderr[:200]}") + return result.stdout.strip() + + +def mysql_exec_file(filepath): + with open(filepath) as f: + result = subprocess.run( + ["mysql", "-u", "root", DB_NAME, "--max-allowed-packet=64M"], + stdin=f, capture_output=True, text=True, timeout=300 + ) + if result.returncode != 0: + print(f" [SQL ERROR] {result.stderr[:200]}") + return result + + +def escape_sql(s): + return s.replace("\\", "\\\\").replace("'", "\\'").replace('"', '\\"') + + +def main(): + print("=" * 60) + print("Fast Metadata Insert - Direct MySQL") + print(f"File fields: {len(FILE_FIELDS)}, TF fields: {len(TF_FIELDS)}") + print(f"Doel: {NUM_TEAMFOLDERS} folders x {FILES_PER_FOLDER} files x {len(FILE_FIELDS)} fields") + print(f" = {NUM_TEAMFOLDERS * FILES_PER_FOLDER * len(FILE_FIELDS):,} file metadata rows") + print(f" + {NUM_TEAMFOLDERS * len(TF_FIELDS):,} groupfolder metadata rows") + print("=" * 
60) + + start_time = time.time() + + # Step 1: Get groupfolder mapping + print("\nStap 1: Ophalen groupfolder mapping...") + rows = mysql_exec(""" + SELECT folder_id, mount_point FROM oc_group_folders + WHERE mount_point LIKE '{{ teamfolder_prefix }}-%' + ORDER BY folder_id + """) + gf_map = {} + for line in rows.split('\n'): + if line.strip(): + parts = line.split('\t') + gf_id = int(parts[0]) + tf_idx = int(parts[1].split('-')[-1]) + gf_map[tf_idx] = gf_id + print(f" {len(gf_map)} groupfolders gevonden") + + # Step 2: Get storage mapping + print("\nStap 2: Ophalen storage mapping...") + rows = mysql_exec(""" + SELECT s.numeric_id, + REPLACE(REPLACE(s.id, 'local::/var/www/nextcloud/data/__groupfolders/', ''), '/', '') as folder_num + FROM oc_storages s + WHERE s.id LIKE 'local::%/__groupfolders/%' + """) + storage_map = {} + for line in rows.split('\n'): + if line.strip(): + parts = line.split('\t') + storage_map[int(parts[1])] = int(parts[0]) + + # Step 3: Check which folders are already complete + print("\nStap 3: Checken welke folders al klaar zijn...") + done_rows = mysql_exec(""" + SELECT groupfolder_id, COUNT(*) as cnt + FROM oc_metavox_file_gf_meta + GROUP BY groupfolder_id + """) + done_gf_ids = set() + target_per_folder = FILES_PER_FOLDER * len(FILE_FIELDS) + for line in (done_rows or "").split('\n'): + if line.strip(): + parts = line.split('\t') + gf_id = int(parts[0]) + cnt = int(parts[1]) + if cnt >= target_per_folder * 0.1: + done_gf_ids.add(gf_id) + print(f" gf_id={gf_id}: {cnt}/{target_per_folder} ({cnt*100//target_per_folder}%) - SKIP") + print(f" {len(done_gf_ids)} folders al (grotendeels) compleet, worden overgeslagen") + + # Step 4: Insert groupfolder metadata + print("\nStap 4: Groupfolder metadata invoegen...") + now = datetime.now().strftime("%Y-%m-%d %H:%M:%S") + values = [] + for tf_idx in range(1, NUM_TEAMFOLDERS + 1): + gf_id = gf_map.get(tf_idx) + if not gf_id: + continue + for fname, ftype in TF_FIELDS: + val = 
escape_sql(gen_value(fname, ftype)) + values.append(f"({gf_id}, '{fname}', '{val}', '{now}', '{now}')") + + sql_file = "/tmp/gf_metadata_insert.sql" + with open(sql_file, 'w') as f: + f.write("INSERT INTO oc_metavox_gf_metadata (groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n") + f.write(",\n".join(values)) + f.write(";\n") + mysql_exec_file(sql_file) + print(f" {len(values)} groupfolder metadata records ingevoegd") + + # Step 5: Insert file metadata per folder + print("\nStap 5: File metadata invoegen...") + total_inserted = 0 + + for tf_idx in range(1, NUM_TEAMFOLDERS + 1): + gf_id = gf_map.get(tf_idx) + if not gf_id: + continue + if gf_id in done_gf_ids: + total_inserted += FILES_PER_FOLDER * len(FILE_FIELDS) + continue + storage_id = storage_map.get(gf_id) + if not storage_id: + print(f" [SKIP] Geen storage voor gf_id {gf_id}") + continue + + # Get file IDs for this folder + file_ids_raw = mysql_exec(f""" + SELECT fileid FROM oc_filecache + WHERE storage = {storage_id} AND name LIKE '{{ dummy_file_prefix }}%' + """) + file_ids = [int(x) for x in file_ids_raw.split('\n') if x.strip()] + + if not file_ids: + print(f" [SKIP] Folder {tf_idx}: geen bestanden") + continue + + # Generate all rows for this folder + values = [] + for fid in file_ids: + for fname, ftype in FILE_FIELDS: + val = escape_sql(gen_value(fname, ftype)) + values.append(f"({fid}, {gf_id}, '{fname}', '{val}', '{now}', '{now}')") + + # Write in batches + for batch_start in range(0, len(values), SQL_BATCH_SIZE): + batch = values[batch_start:batch_start + SQL_BATCH_SIZE] + sql_file = "/tmp/file_meta_batch.sql" + with open(sql_file, 'w') as f: + f.write("INSERT INTO oc_metavox_file_gf_meta (file_id, groupfolder_id, field_name, field_value, created_at, updated_at) VALUES\n") + f.write(",\n".join(batch)) + f.write(";\n") + mysql_exec_file(sql_file) + + total_inserted += len(values) + elapsed = time.time() - start_time + rate = total_inserted / elapsed if elapsed > 0 else 0 + 
print(f" [DONE] Folder {tf_idx} (gf_id={gf_id}): {len(file_ids)} files x {len(FILE_FIELDS)} fields = {len(values)} rows ({elapsed:.0f}s, totaal: {total_inserted:,}, {rate:.0f}/s)") + + elapsed = time.time() - start_time + print(f"\n{'='*60}") + print(f"Voltooid in {elapsed:.0f} seconden ({elapsed/3600:.1f} uur)") + print(f"File metadata records: {total_inserted:,}") + print(f"GF metadata records: {len(gf_map) * len(TF_FIELDS)}") + print(f"{'='*60}") + + +if __name__ == "__main__": + main() diff --git a/templates/setup_metadata_fields.py.j2 b/templates/setup_metadata_fields.py.j2 new file mode 100644 index 0000000..035d0e3 --- /dev/null +++ b/templates/setup_metadata_fields.py.j2 @@ -0,0 +1,191 @@ +#!/usr/bin/env python3 +""" +MetaVox Metadata Velden Setup Script +===================================== +Maakt teamfolder-velden en file-metadata-velden aan via de MetaVox OCS API. + + POST /ocs/v2.php/apps/metavox/api/v1/groupfolder-fields + Payload: field_name, field_label, field_type, field_description, + field_options (array), is_required, sort_order, + applies_to_groupfolder (1=teamfolder metadata, 0=file metadata) + + POST /ocs/v2.php/apps/metavox/api/v1/groupfolders/{id}/fields + Payload: field_ids (array) - wijst velden toe aan een specifieke groupfolder +""" + +import json +import sys +import time +import requests +from requests.auth import HTTPBasicAuth + +NC_URL = "{{ nextcloud_url }}" +NC_USER = "{{ nextcloud_admin_user }}" +NC_PASS = "{{ nextcloud_admin_password }}" +TIMEOUT = {{ http_timeout }} +MAX_RETRIES = {{ max_retries }} + +OCS_BASE = f"{NC_URL}/ocs/v2.php/apps/metavox/api/v1" + +AUTH = HTTPBasicAuth(NC_USER, NC_PASS) +HEADERS = { + "OCS-APIRequest": "true", + "Accept": "application/json", + "Content-Type": "application/json", +} + + +def api_request(method, url, data=None, retries=MAX_RETRIES): + for attempt in range(retries): + try: + resp = requests.request( + method, url, auth=AUTH, headers=HEADERS, + json=data, timeout=TIMEOUT, + ) + if 
def api_request(method, url, data=None, retries=MAX_RETRIES):
    """Send one JSON request to the OCS API, retrying transient failures.

    Returns the ``requests.Response`` of the last attempt (or ``None`` when
    *retries* is 0).  HTTP 200/201 and the "already exists" HTTP 500 answer
    are returned immediately.  Server errors (5xx) and HTTP 429 are retried
    with exponential backoff; other statuses are returned at once because a
    retry cannot change them.  Network errors are retried as well and
    re-raised after the final attempt.
    """
    resp = None
    for attempt in range(retries):
        last_attempt = attempt >= retries - 1
        try:
            resp = requests.request(
                method, url, auth=AUTH, headers=HEADERS,
                json=data, timeout=TIMEOUT,
            )
        except requests.exceptions.RequestException as e:
            print(f" [ERROR] Poging {attempt+1}/{retries}: {e}")
            if last_attempt:
                raise
            time.sleep(2 ** attempt)
            continue

        if resp.status_code in (200, 201):
            return resp
        if resp.status_code == 500 and "already exists" in resp.text:
            print(f" [SKIP] Veld bestaat al")
            return resp

        print(f" [WARN] HTTP {resp.status_code}: {resp.text[:200]}")
        retryable = resp.status_code >= 500 or resp.status_code == 429
        if last_attempt or not retryable:
            return resp
        # Previously the code slept here and then returned without retrying.
        time.sleep(2 ** attempt)
    return resp


def create_field(field, applies_to_groupfolder):
    """Create one metadata field definition through the OCS API.

    *field* is a dict with ``name`` and ``type`` and optionally
    ``description`` and ``options`` (a comma-separated string or a list).
    *applies_to_groupfolder* is 1 for teamfolder metadata, 0 for file
    metadata.  Returns whatever ``api_request`` returns.
    """
    options = field.get("options", "")
    if isinstance(options, str) and options:
        options = [o.strip() for o in options.split(",")]
    elif not isinstance(options, list):
        options = []

    payload = {
        "field_name": field["name"],
        "field_label": field.get("description", field["name"]),
        "field_type": field["type"],
        "field_description": field.get("description", ""),
        "field_options": options,
        "is_required": False,
        "sort_order": 0,
        "applies_to_groupfolder": applies_to_groupfolder,
    }

    url = f"{OCS_BASE}/groupfolder-fields"
    return api_request("POST", url, payload)


def assign_fields_to_groupfolder(groupfolder_id, field_ids):
    """Assign the fields *field_ids* to the groupfolder *groupfolder_id*."""
    url = f"{OCS_BASE}/groupfolders/{groupfolder_id}/fields"
    return api_request("POST", url, {"field_ids": field_ids})


def _get_ocs_data(url):
    """GET *url* and return the ``ocs.data`` payload, or [] on any failure."""
    resp = api_request("GET", url)
    if resp is None or resp.status_code != 200:
        return []
    try:
        data = resp.json()
    except ValueError:
        return []
    if isinstance(data, dict) and "ocs" in data and "data" in data["ocs"]:
        return data["ocs"]["data"]
    return []


def get_existing_fields():
    """Return all field definitions known to MetaVox ([] on failure)."""
    return _get_ocs_data(f"{OCS_BASE}/groupfolder-fields")


def get_groupfolders():
    """Return all groupfolders known to MetaVox ([] on failure)."""
    return _get_ocs_data(f"{OCS_BASE}/groupfolders")


def _create_fields(fields, applies_to_groupfolder):
    """Create every field in *fields*; return ``(created_count, new_ids)``.

    A field counts as created on HTTP 200/201.  Its id is collected when the
    response carries one under ``ocs.data.id``.
    """
    created = 0
    new_ids = []
    for field in fields:
        print(f" Aanmaken: {field['name']} ({field['type']})")
        resp = create_field(field, applies_to_groupfolder=applies_to_groupfolder)
        if resp is None or resp.status_code not in (200, 201):
            continue
        created += 1
        try:
            body = resp.json()
        except ValueError:
            continue
        payload = body.get("ocs", {}).get("data", {}) if isinstance(body, dict) else {}
        fid = payload.get("id") if isinstance(payload, dict) else None
        if fid:
            new_ids.append(fid)
    return created, new_ids


def main():
    """Create all teamfolder and file fields, then assign them everywhere."""
    print("=" * 60)
    print("MetaVox Metadata Velden Setup (OCS API)")
    print(f"Endpoint: {OCS_BASE}/groupfolder-fields")
    print("=" * 60)

    created_field_ids = []

    # ---- Teamfolder metadata fields (applies_to_groupfolder=1) ----
    tf_fields = json.loads("""{{ teamfolder_metadata_fields | to_json }}""")
    print(f"\n[1/3] {len(tf_fields)} teamfolder-velden (applies_to_groupfolder=1)...")
    tf_created, ids = _create_fields(tf_fields, 1)
    created_field_ids.extend(ids)
    print(f" -> {tf_created} teamfolder-velden created")

    # ---- File metadata fields (applies_to_groupfolder=0) ----
    file_fields = json.loads("""{{ file_metadata_fields | to_json }}""")
    print(f"\n[2/3] {len(file_fields)} file-velden (applies_to_groupfolder=0)...")
    f_created, ids = _create_fields(file_fields, 0)
    created_field_ids.extend(ids)
    print(f" -> {f_created} file-velden created")

    # ---- Assign all fields to all groupfolders ----
    print(f"\n[3/3] Velden toewijzen aan groupfolders...")

    all_fields = get_existing_fields()
    all_field_ids = [f["id"] for f in all_fields if "id" in f]
    if not all_field_ids:
        # Fall back to the ids collected while creating the fields.
        all_field_ids = created_field_ids

    if all_field_ids:
        gf_count = 0
        for gf in get_groupfolders():
            gf_id = gf.get("id") or gf.get("group_folder_id")
            if not gf_id:
                continue
            resp = assign_fields_to_groupfolder(gf_id, all_field_ids)
            if resp is not None and resp.status_code == 200:
                gf_count += 1
                # Report progress only when the counter actually advanced.
                if gf_count % 10 == 0:
                    print(f" {gf_count} groupfolders verwerkt...")
        print(f" -> {gf_count} groupfolders hebben nu alle velden")
    else:
        print(" [WARN] Geen field IDs beschikbaar")

    total = tf_created + f_created
    print(f"\n{'=' * 60}")
    print(f"Totaal: {total} metadata-velden created")
    print(f"{'=' * 60}")


if __name__ == "__main__":
    main()