aboutsummaryrefslogtreecommitdiff
path: root/examples
diff options
context:
space:
mode:
Diffstat (limited to 'examples')
-rw-r--r--examples/json-schema-to-grammar.py132
1 files changed, 132 insertions, 0 deletions
diff --git a/examples/json-schema-to-grammar.py b/examples/json-schema-to-grammar.py
new file mode 100644
index 0000000..2dccc11
--- /dev/null
+++ b/examples/json-schema-to-grammar.py
@@ -0,0 +1,132 @@
+import argparse
+import json
+import re
+import sys
+
# whitespace is constrained to a single space char to prevent model "running away" in
# whitespace. Also maybe improves generation quality?
SPACE_RULE = '" "?'

# Grammar rule bodies for the JSON schema primitive types, keyed by the schema `type` value.
# Every rule ends with `space` so that optional trailing whitespace is consumed.
PRIMITIVE_RULES = {
    'boolean': '("true" | "false") space',
    'number': '("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space',
    'integer': '("-"? ([0-9] | [1-9] [0-9]*)) space',
    'string': r''' "\"" (
        [^"\\] |
        "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
      )* "\"" space ''',
    'null': '"null" space',
}

# Runs of characters outside [a-zA-Z0-9-] are collapsed to a single "-" in rule names.
INVALID_RULE_CHARS_RE = re.compile(r'[^a-zA-Z0-9-]+')

# Characters that must be escaped inside a double-quoted grammar literal. The backslash
# is included because the grammar itself uses it as the escape character (see the
# 'string' rule above, which spells a quote as "\"" and a backslash as "\\"); without
# it, JSON escapes such as \" or \n emitted by json.dumps would be re-interpreted by
# the grammar instead of being matched verbatim.
GRAMMAR_LITERAL_ESCAPE_RE = re.compile(r'[\r\n"\\]')
GRAMMAR_LITERAL_ESCAPES = {'\r': '\\r', '\n': '\\n', '"': '\\"', '\\': '\\\\'}


class SchemaConverter:
    """Converts a parsed JSON schema into named grammar rules.

    Rules are accumulated in ``self._rules`` (rule name -> rule body) by ``visit`` and
    rendered as ``name ::= body`` lines by ``format_grammar``.
    """

    def __init__(self, prop_order):
        """Create a converter.

        prop_order: dict mapping property name -> sort position; properties missing
            from it are ordered after all listed ones, alphabetically.
        """
        self._prop_order = prop_order
        # The `space` rule is referenced by every other rule, so it always exists.
        self._rules = {'space': SPACE_RULE}

    def _format_literal(self, literal):
        """Return a quoted grammar literal that matches the JSON encoding of `literal`.

        The value is serialised with json.dumps, then every character that is special
        inside a grammar literal (quote, backslash, CR, LF) is backslash-escaped.
        """
        escaped = GRAMMAR_LITERAL_ESCAPE_RE.sub(
            lambda m: GRAMMAR_LITERAL_ESCAPES[m.group(0)], json.dumps(literal)
        )
        return f'"{escaped}"'

    def _add_rule(self, name, rule):
        """Register `rule` under a sanitised form of `name` and return the key used.

        The sanitised name is reused when it is free or already bound to an identical
        rule; otherwise the lowest unused integer suffix (0, 1, ...) is appended.
        """
        esc_name = INVALID_RULE_CHARS_RE.sub('-', name)
        if esc_name not in self._rules or self._rules[esc_name] == rule:
            key = esc_name
        else:
            i = 0
            while f'{esc_name}{i}' in self._rules:
                i += 1
            key = f'{esc_name}{i}'
        self._rules[key] = rule
        return key

    @staticmethod
    def _child_name(name, suffix):
        """Build the name hint for a sub-schema: `name-suffix`, or `suffix` at the root."""
        return f'{name}-{suffix}' if name else f'{suffix}'

    def visit(self, schema, name):
        """Add the rule(s) for `schema` and return the name of its top-level rule.

        schema: dict holding a JSON schema (or sub-schema).
        name: name hint for the rule; the empty string denotes the root schema, whose
            rule is called 'root'.

        Handles, in order of precedence: `oneOf`/`anyOf`, `const`, `enum`, objects with
        `properties`, arrays with `items`, and the primitive types in PRIMITIVE_RULES.
        Raises AssertionError for any other schema.
        """
        schema_type = schema.get('type')
        rule_name = name or 'root'

        if 'oneOf' in schema or 'anyOf' in schema:
            # Each alternative gets its own rule, named after its index.
            alternatives = schema.get('oneOf') or schema['anyOf']
            rule = ' | '.join(
                self.visit(alt_schema, self._child_name(name, i))
                for i, alt_schema in enumerate(alternatives)
            )
            return self._add_rule(rule_name, rule)

        elif 'const' in schema:
            return self._add_rule(rule_name, self._format_literal(schema['const']))

        elif 'enum' in schema:
            rule = ' | '.join(self._format_literal(v) for v in schema['enum'])
            return self._add_rule(rule_name, rule)

        elif schema_type == 'object' and 'properties' in schema:
            # TODO: `required` keyword
            prop_order = self._prop_order
            prop_pairs = sorted(
                schema['properties'].items(),
                # sort by position in prop_order (if specified) then by key
                key=lambda kv: (prop_order.get(kv[0], len(prop_order)), kv[0]),
            )

            # Every property is emitted, in the sorted order, separated by commas.
            rule = '"{" space'
            for i, (prop_name, prop_schema) in enumerate(prop_pairs):
                prop_rule_name = self.visit(prop_schema, self._child_name(name, prop_name))
                if i > 0:
                    rule += ' "," space'
                rule += f' {self._format_literal(prop_name)} space ":" space {prop_rule_name}'
            rule += ' "}" space'

            return self._add_rule(rule_name, rule)

        elif schema_type == 'array' and 'items' in schema:
            # TODO `prefixItems` keyword
            item_rule_name = self.visit(schema['items'], self._child_name(name, 'item'))
            rule = f'"[" space ({item_rule_name} ("," space {item_rule_name})*)? "]" space'
            return self._add_rule(rule_name, rule)

        else:
            assert schema_type in PRIMITIVE_RULES, f'Unrecognized schema: {schema}'
            # Primitive rules are shared under the type's own name, except at the root,
            # where the rule must be called 'root'.
            return self._add_rule(
                'root' if rule_name == 'root' else schema_type,
                PRIMITIVE_RULES[schema_type]
            )

    def format_grammar(self):
        """Return all accumulated rules as `name ::= rule` lines, in insertion order."""
        return '\n'.join(f'{name} ::= {rule}' for name, rule in self._rules.items())
+
+
def main(args_in = None):
    """Command-line entry point: read a JSON schema and print the matching grammar.

    args_in: optional list of argument strings; when None, argparse reads sys.argv.
    """
    parser = argparse.ArgumentParser(
        description='''
            Generates a grammar (suitable for use in ./main) that produces JSON conforming to a
            given JSON schema. Only a subset of JSON schema features are supported; more may be
            added in the future.
        ''',
    )
    parser.add_argument(
        '--prop-order',
        default=[],
        type=lambda s: s.split(','),
        help='''
            comma-separated property names defining the order of precedence for object properties;
            properties not specified here are given lower precedence than those that are, and are
            sorted alphabetically
        '''
    )
    parser.add_argument('schema', help='file containing JSON schema ("-" for stdin)')
    args = parser.parse_args(args_in)

    if args.schema == '-':
        schema = json.load(sys.stdin)
    else:
        # Close the schema file as soon as it has been parsed.
        with open(args.schema) as schema_file:
            schema = json.load(schema_file)

    # Turn the ordered name list into a name -> position lookup for sorting.
    prop_order = {name: idx for idx, name in enumerate(args.prop_order)}
    converter = SchemaConverter(prop_order)
    converter.visit(schema, '')
    print(converter.format_grammar())


if __name__ == '__main__':
    main()