#!/usr/bin/env python3
"""
Extract the structural schema of a compliance xlsx file as JSON.

Usage:
    python3 extract_xlsx_schema.py <path-to-xlsx>

Output:
    {
      "sheets": [
        {
          "name": "Summary",
          "columns": ["Metric", "Non-Compliant", "..."],
          "metric_values": ["2.3.4i", "5.2.4", "..."]
        },
        {
          "name": "2.3.4i",
          "columns": ["Preferred - Hostname", "GRANITE - IPv4_Address", "..."]
        }
      ]
    }

- Uses openpyxl in read-only mode.
- Extracts sheet names, first-row column headers per sheet, and unique
  metric values from the Summary sheet (header at row 4, data from row 5
  onward).
- On error, returns { "error": "..." } on stdout and exits with non-zero code.

Dependencies: openpyxl (already in requirements.txt)
"""

import sys
import json

from openpyxl import load_workbook


def _fail(message):
    """Print ``{"error": message}`` as JSON on stdout and exit with status 1."""
    print(json.dumps({"error": message}))
    sys.exit(1)


def _first_row_columns(ws):
    """Return the stripped string form of every non-None cell in row 1."""
    # list() fully drains the row generator instead of leaving it suspended.
    rows = list(ws.iter_rows(max_row=1, values_only=True))
    if not rows:
        return []
    return [str(cell).strip() for cell in rows[0] if cell is not None]


def _summary_metric_values(ws):
    """Return the sorted unique values of the "Metric" column of a Summary sheet.

    The header is read from row 4 and the data from row 5 onward. An empty
    list is returned when row 4 is missing or has no "Metric" column.
    """
    header_rows = list(ws.iter_rows(min_row=4, max_row=4, values_only=True))
    if not header_rows:
        return []

    headers = [str(cell).strip() if cell else "" for cell in header_rows[0]]
    try:
        metric_idx = headers.index("Metric")
    except ValueError:
        return []

    unique_values = set()
    for row in ws.iter_rows(min_row=5, values_only=True):
        # A data row may be shorter than the header row; treat the missing
        # cell as empty rather than raising IndexError.
        if metric_idx >= len(row):
            continue
        cell = row[metric_idx]
        if cell is None:
            continue
        text = str(cell).strip()
        # Skip blanks and any repeated "Metric" header line inside the data.
        if text and text != "Metric":
            unique_values.add(text)
    return sorted(unique_values)


def _extract_sheets(wb):
    """Build the list of per-sheet schema entries for an open workbook."""
    sheets = []
    for sheet_name in wb.sheetnames:
        ws = wb[sheet_name]
        entry = {
            "name": sheet_name,
            # NOTE(review): the Summary sheet's header is documented at row 4,
            # but "columns" is read from row 1 for every sheet, Summary
            # included -- confirm this is intended.
            "columns": _first_row_columns(ws),
        }
        if sheet_name == "Summary":
            entry["metric_values"] = _summary_metric_values(ws)
        sheets.append(entry)
    return sheets


def main():
    """Read the xlsx path from ``sys.argv[1]`` and print its schema as JSON.

    On success prints ``{"sheets": [...]}`` to stdout. On any failure
    (missing argument, unreadable file, workbook without sheets, or an error
    while reading sheet contents) prints ``{"error": "..."}`` to stdout and
    exits with status 1.
    """
    if len(sys.argv) < 2:
        _fail("No file path provided")

    filepath = sys.argv[1]
    try:
        wb = load_workbook(filepath, read_only=True, data_only=True)
    except Exception as e:
        _fail(f"Cannot open file: {str(e)}")

    try:
        if not wb.sheetnames:
            _fail("Workbook contains no sheets")
        sheets = _extract_sheets(wb)
    except Exception as e:
        # Top-level boundary: keep the documented JSON error contract instead
        # of leaking a traceback. SystemExit raised by _fail is not an
        # Exception subclass, so it passes straight through.
        _fail(f"Cannot read workbook: {str(e)}")
    finally:
        # Runs on success, on error, and on exit, so the workbook is always
        # closed.
        wb.close()

    print(json.dumps({"sheets": sheets}))


if __name__ == "__main__":
    main()