#!/usr/bin/env python3
"""
Dump the structural schema of a compliance xlsx file as JSON.

Usage:
    python3 dump_xlsx_schema.py <path-to-xlsx>

Output (stdout):
{
  "sheets": [
    {
      "name": "SheetName",
      "columns": ["Col A", "Col B", ...],
      "row_count": 150,
      "metric_values": ["2.3.4i", "5.2.4", ...]  // only for the "Summary" sheet,
                                                 // when its "Metric" column has values
    },
    ...
  ]
}

If no path is given, openpyxl is unavailable, or the workbook cannot be
opened, a JSON object {"error": "..."} is printed instead and the process
exits with status 1.

Dependencies:
    openpyxl (already in requirements.txt)
"""

import json
import sys

try:
    from openpyxl import load_workbook
except ImportError:
    # Keep the module importable so main() can report the problem through the
    # same JSON error channel as every other failure.
    load_workbook = None

# Layout of the "Summary" sheet: its header lives on row 4 (1-indexed) and the
# data starts on the row right after it.
SUMMARY_SHEET = 'Summary'
SUMMARY_HEADER_ROW = 4
METRIC_HEADER = 'Metric'


def _header_columns(ws):
    """Return the stripped, non-empty cell values of the sheet's first row."""
    first_row = next(iter(ws.iter_rows(max_row=1, values_only=True)), None)
    if not first_row:
        return []
    return [str(cell).strip() for cell in first_row if cell is not None]


def _count_data_rows(ws):
    """Count the rows from row 2 onward (row 1 is treated as the header)."""
    return sum(1 for _ in ws.iter_rows(min_row=2, values_only=True))


def _summary_metric_values(ws):
    """Return the sorted, de-duplicated values of the Summary "Metric" column.

    The column is located by looking for a "Metric" cell on the Summary header
    row; an empty list is returned when there is no such cell.
    """
    header = next(
        iter(ws.iter_rows(min_row=SUMMARY_HEADER_ROW,
                          max_row=SUMMARY_HEADER_ROW,
                          values_only=True)),
        None,
    )
    if not header:
        return []

    metric_idx = next(
        (i for i, cell in enumerate(header)
         if cell is not None and str(cell).strip() == METRIC_HEADER),
        None,
    )
    if metric_idx is None:
        return []

    found = set()
    for row in ws.iter_rows(min_row=SUMMARY_HEADER_ROW + 1, values_only=True):
        # Defensive: a row may be shorter than the header row, in which case
        # it simply has no metric cell.
        if metric_idx >= len(row):
            continue
        cell = row[metric_idx]
        if cell is None:
            continue
        text = str(cell).strip()
        # Skip blanks and repeated header cells further down the sheet.
        if text and text != METRIC_HEADER:
            found.add(text)
    return sorted(found)


def _describe_sheet(ws, sheet_name):
    """Build the JSON-serialisable schema entry for one worksheet."""
    entry = {
        'name': sheet_name,
        'columns': _header_columns(ws),
        'row_count': _count_data_rows(ws),
    }
    if sheet_name == SUMMARY_SHEET:
        metric_values = _summary_metric_values(ws)
        # The key is omitted entirely when there is nothing to report.
        if metric_values:
            entry['metric_values'] = metric_values
    return entry


def main():
    """Read the xlsx path from argv and print the workbook schema as JSON.

    Prints {"error": ...} and exits with status 1 when the path argument is
    missing, openpyxl cannot be imported, or the workbook cannot be opened.
    Errors raised while reading the sheets propagate to the caller, but the
    workbook is closed first.
    """
    if len(sys.argv) < 2:
        print(json.dumps({'error': 'No file path provided'}))
        sys.exit(1)

    if load_workbook is None:
        print(json.dumps({'error': 'openpyxl is not installed'}))
        sys.exit(1)

    filepath = sys.argv[1]
    try:
        wb = load_workbook(filepath, read_only=True, data_only=True)
    except Exception as e:
        # Top-level boundary: any failure to open is reported as JSON.
        print(json.dumps({'error': f'Cannot open file: {str(e)}'}))
        sys.exit(1)

    try:
        sheets = [_describe_sheet(wb[name], name) for name in wb.sheetnames]
    finally:
        # A read-only workbook holds the file open until closed explicitly, so
        # release it even when reading a sheet fails.
        wb.close()

    print(json.dumps({'sheets': sheets}, indent=2))


if __name__ == '__main__':
    main()