#!/usr/bin/env python3
import argparse
import glob
import json
import os
import re
import xml.etree.ElementTree as ET

# Command-line interface: one required output path and two optional input folders.
arg_parser = argparse.ArgumentParser(
    description='CMake built-in command signature extractor.',
)
arg_parser.add_argument(
    '--output',
    type=str,
    required=True,
    help='Output file path for the json result.',
)
arg_parser.add_argument(
    '--input_commands',
    type=str,
    help='Folder containing parsed cmake commands to html.',
)
arg_parser.add_argument(
    '--input_module_commands',
    type=str,
    help='Folder containing parsed cmake commands of modules to html.',
)


# Helpers
def flat_map(func, lst):
    """Apply ``func`` to every item of ``lst`` and flatten the results by one level."""
    mapped = map(func, lst)
    return flatten(mapped)


def flatten(lst):
    """Concatenate the items of each iterable in ``lst`` into one new list."""
    flat = []
    for sublist in lst:
        flat.extend(sublist)
    return flat


def get_builtin_command_name(root):
    """Return the command name stored in the ``id`` of the heading's sibling ``span``.

    Looks up the first ``span`` that shares a parent with a first ``h1`` child
    and strips a leading ``command:`` from its ``id`` attribute.
    """
    anchor = root.find(".//h1[1]/../span")
    anchor_id = anchor.attrib['id']
    return anchor_id.removeprefix('command:')


def process_command_text(text):
    """Wrap a signature string in the ``{"text": ...}`` record used for variants."""
    record = dict(text=text)
    return record


def remove_comments(text):
    """Replace ``#`` comments in ``text`` with a single space.

    A comment here is ``#`` followed by letters, digits, spaces, ``.``, ``_``
    or ``-`` and terminated by ``<`` or a newline; the terminator is consumed
    together with the comment.
    """
    comment_pattern = r'#[a-zA-Z0-9 ._-]+(<|\n)'
    return re.sub(comment_pattern, ' ', text)


def replace_multiple_whitespaces(text):
    """Collapse every whitespace run to one space and trim both ends."""
    words = text.split()
    return ' '.join(words)


def is_command(text):
    """Tell whether ``text`` looks like a single command signature.

    The text must match ``name(args)`` with space-separated argument tokens
    drawn from the character sets in the pattern, and the name must contain at
    least one lowercase letter.
    """
    match = re.search(
        r"\s*^([a-zA-Z_]+)\( ?([#a-zA-Z0-9[\]_<>|\"\.-]+( [#a-zA-Z0-9[\]_<>|()\"\.-]+)*)? ?\)\s*$",
        text,
    )
    if match is None:
        return False

    name = match.group(1)
    return any(char.islower() and char.isalpha() for char in name)


def get_module_name(xml):
    """Return the stripped text of the first ``h1`` found directly under a ``section``.

    Raises ``IndexError`` when no such heading exists.
    """
    headings = xml.findall(".//section/h1")
    title_bytes = ET.tostring(headings[0], method="text")
    return title_bytes.decode().strip()


def get_command_name_from_sig(signature):
    """Return the part of ``signature`` that precedes its first ``(``.

    When the signature contains no ``(`` the whole string is returned, rather
    than a string with its last character cut off.
    """
    name, _, _ = signature.partition('(')
    return name


# Processing builtin command docs
def process_builtin_command_block(code_block, cmd_name):
    """Extract the signatures of ``cmd_name`` from one code block element.

    The block's text is cut after every ``)``; each piece that starts with
    ``cmd_name`` is whitespace-normalised and returned as a ``{"text": ...}``
    record. Pieces that start with anything else are dropped.
    """
    text = ET.tostring(code_block, method="text").decode().strip()

    variants = []
    segment_start = 0
    for pos, char in enumerate(text):
        if char != ')':
            continue

        candidate = text[segment_start:pos + 1].strip()
        if candidate.startswith(cmd_name):
            variants.append(process_command_text(replace_multiple_whitespaces(candidate)))
        # The next piece begins right after this closing parenthesis.
        segment_start = pos + 1

    return variants


def process_builtin_command_file(html_text):
    """Parse one built-in command page into ``{"commandName", "variants"}``.

    Only ``pre`` elements whose serialised markup mentions the command name
    are scanned for signatures.
    """
    root = ET.fromstring(html_text)
    cmd_name = get_builtin_command_name(root)

    variants = []
    for block in root.findall(".//pre"):
        if cmd_name not in ET.tostring(block).decode():
            continue
        variants.extend(process_builtin_command_block(block, cmd_name))

    return {"commandName": cmd_name, "variants": variants}


# Processing module command docs
def process_module_command_block(xml_block):
    """Split the text of a code block into candidate call expressions.

    Comments are removed and whitespace is collapsed first. The text is then
    cut each time the parenthesis depth returns to zero after an opening
    parenthesis has been seen; every piece becomes a ``{"text": ...}`` record.
    Text after the last such cut is not returned.
    """
    raw = ET.tostring(xml_block, method="text").decode().strip()
    text = replace_multiple_whitespaces(remove_comments(raw))

    pieces = []
    segment_start = 0
    depth = 0
    inside_args = False
    call_open = False
    for pos, char in enumerate(text):
        if char == '(':
            depth += 1
            inside_args = call_open = True
        elif char == ')':
            depth -= 1
            if depth == 0:
                inside_args = False

        # Nothing to emit until a call has been opened and fully closed again.
        if not call_open or inside_args:
            continue

        call_open = False
        segment = text[segment_start:pos + 1].strip()
        pieces.append(process_command_text(replace_multiple_whitespaces(segment)))
        segment_start = pos + 1

    return pieces


def get_signatures(root):
    """Collect command signatures from every ``pre`` element below ``root``.

    Returns a list of ``{"commandName": ..., "text": ...}`` dicts, one per
    candidate that ``is_command`` accepts, in document order.
    """
    signatures = []
    for block in root.findall(".//pre"):
        for candidate in process_module_command_block(block):
            text = candidate["text"]
            if is_command(text):
                signatures.append({"commandName": get_command_name_from_sig(text), "text": text})

    return signatures


# Customized parsing
def parse_module_external_project(root):
    """Extract signatures from the ExternalProject page, expanding placeholders.

    For ``externalproject_add`` and ``externalproject_add_step`` the
    ``<option>`` placeholder is replaced by an alternation of every
    ``dt/code`` text found under the command's ``dl``; for the latter,
    ``<step>`` is additionally replaced by a fixed list of step names.
    Signatures of other commands are returned untouched.
    """
    expandable = ("command:externalproject_add", "command:externalproject_add_step")
    step_choices = '{ `mkdir` | `download` | `update` | `patch` | `configure` | `build` | `install` | `test` }'

    result = []
    for cmd_root in root.findall(".//dl[@class='cmake command']"):
        cmd_name = cmd_root.find('dt').get('id')
        sigs = get_signatures(cmd_root)

        if cmd_name in expandable:
            keywords = [
                ET.tostring(node, method="text").decode().strip()
                for node in cmd_root.findall(".//dt/code")
            ]
            option_choices = '{ ' + ' | '.join(keywords) + ' }'
            for sig in sigs:
                expanded = sig["text"].replace("<option>", option_choices)
                if cmd_name == "command:externalproject_add_step":
                    expanded = expanded.replace('<step>', step_choices)
                sig['text'] = expanded

        result.extend(sigs)

    return result


def process_module_command_file(html_text):
    """Parse one module page into ``{"moduleName", "commands"}``.

    The ExternalProject module goes through its dedicated parser because many
    of its keywords are not part of the signatures themselves; every other
    module uses the generic signature extraction. Commands are grouped by name.
    """
    root = ET.fromstring(html_text)
    name = get_module_name(root)

    if name == "ExternalProject":
        parsed = parse_module_external_project(root)
    else:
        parsed = [
            {"commandName": get_command_name_from_sig(sig["text"]), "text": sig["text"]}
            for sig in get_signatures(root)
        ]

    return {"moduleName": name, "commands": group_by_command_name(parsed)}


def process_files(process_file, file_paths):
    """Read each file as text and return ``process_file(contents)`` for each, in order."""
    results = []
    for path in file_paths:
        with open(path, "r") as handle:
            contents = handle.read()
        results.append(process_file(contents))

    return results


def group_by_command_name(parsed_cmd):
    """Group signature records by ``commandName``.

    Returns one ``{"commandName": ..., "variants": [{"text": ...}, ...]}``
    entry per distinct name, ordered by first appearance, with variants in
    input order.
    """
    grouped = {}
    for entry in parsed_cmd:
        name = entry["commandName"]
        bucket = grouped.setdefault(name, {"commandName": name, "variants": []})
        bucket["variants"].append({"text": entry["text"]})

    return list(grouped.values())


def main(args):
    """Extract command signatures from the given folders and write them as JSON.

    Args:
        args: Parsed arguments providing ``output`` (path of the JSON file to
            write) and the optional ``input_commands`` and
            ``input_module_commands`` folders; a folder that is ``None`` is
            skipped.

    The written document has the shape ``{"modules": [...]}``. Its first entry
    is always the pseudo-module ``"builtin"`` holding the built-in commands;
    it is followed by every module that still has commands left after the
    built-in ones have been removed from it.
    """
    json_result = {"modules": []}

    # Processes builtin commands
    builtin_commands = []
    if args.input_commands is not None:
        # os.path.join keeps the pattern valid on every platform, not only Windows.
        command_files = glob.glob(os.path.join(args.input_commands, '*.html'))
        builtin_commands = process_files(process_builtin_command_file, command_files)

    # Processes module commands; stays empty when no module folder was given.
    modules = []
    if args.input_module_commands is not None:
        module_command_files = glob.glob(os.path.join(args.input_module_commands, '*.html'))
        parsed_modules = process_files(process_module_command_file, module_command_files)

        # Module pages may repeat built-in commands; those are reported only once,
        # under the "builtin" entry.
        builtin_command_names = {cmd["commandName"] for cmd in builtin_commands}
        modules = [
            {
                "moduleName": module["moduleName"],
                "commands": [cmd for cmd in module["commands"]
                             if cmd["commandName"] not in builtin_command_names],
            }
            for module in parsed_modules
        ]

    json_result["modules"].append({"moduleName": "builtin", "commands": builtin_commands})
    json_result["modules"].extend(module for module in modules if module["commands"])

    # Saves as JSON
    with open(args.output, "w") as outfile:
        json.dump(json_result, outfile)


# Script entry point: parse the command line, then run the extraction.
if __name__ == "__main__":
    cli_args = arg_parser.parse_args()
    main(cli_args)
