From 84e09a7d8bc4ab6d658b5cd81295ac0add60be78 Mon Sep 17 00:00:00 2001
From: Evan Jones
Date: Sun, 23 Jul 2023 23:58:10 -0400
Subject: llama : add grammar-based sampling (#1773)

* llama, main : constrain sampling to grammar

* allow loading grammar from file

* fix whitespace errors

* handle & print parser errors

* add comments to grammar syntax and allow newlines where unambiguous

* add missing include

* support alternates in root rule

* fix bugs with empty token and EOS

* adjust JSON grammar

* remove swp file

* rewrite ternary expressions

Co-authored-by: Henri Vasserman

* use struct for grammar elements and add Unicode support

* add unicode escapes

* add inverse char ranges

* only sample full tokens (no peeking or truncation)

* llama : minor style changes blindly applied in online editor - hopefully I didn't break something

* update help text

* add warning message if EOS is disabled

---------

Co-authored-by: Henri Vasserman
Co-authored-by: Georgi Gerganov
---
 grammars/arithmetic.gbnf |  6 ++++++
 grammars/chess.gbnf      | 13 +++++++++++++
 grammars/japanese.gbnf   |  7 +++++++
 grammars/json.gbnf       | 29 +++++++++++++++++++++++++++++
 grammars/list.gbnf       |  4 ++++
 5 files changed, 59 insertions(+)
 create mode 100644 grammars/arithmetic.gbnf
 create mode 100644 grammars/chess.gbnf
 create mode 100644 grammars/japanese.gbnf
 create mode 100644 grammars/json.gbnf
 create mode 100644 grammars/list.gbnf

(limited to 'grammars')

diff --git a/grammars/arithmetic.gbnf b/grammars/arithmetic.gbnf
new file mode 100644
index 0000000..3aa95a9
--- /dev/null
+++ b/grammars/arithmetic.gbnf
@@ -0,0 +1,6 @@
+root ::= (expr "=" ws term "\n")+
+expr ::= term ([-+*/] term)*
+term ::= ident | num | "(" ws expr ")" ws
+ident ::= [a-z] [a-z0-9_]* ws
+num ::= [0-9]+ ws
+ws ::= [ \t\n]*
diff --git a/grammars/chess.gbnf b/grammars/chess.gbnf
new file mode 100644
index 0000000..ef0fc1b
--- /dev/null
+++ b/grammars/chess.gbnf
@@ -0,0 +1,13 @@
+# Specifies chess moves as a list in algebraic notation, using PGN conventions
+
+# Force first move to "1. ", then any 1-2 digit number after, relying on model to follow the pattern
+root ::= "1. " move " " move "\n" ([1-9] [0-9]? ". " move " " move "\n")+
+move ::= (pawn | nonpawn | castle) [+#]?
+
+# piece type, optional file/rank, optional capture, dest file & rank
+nonpawn ::= [NBKQR] [a-h]? [1-8]? "x"? [a-h] [1-8]
+
+# optional file & capture, dest file & rank, optional promotion
+pawn ::= ([a-h] "x")? [a-h] [1-8] ("=" [NBKQR])?
+
+castle ::= "O-O" "-O"?
diff --git a/grammars/japanese.gbnf b/grammars/japanese.gbnf
new file mode 100644
index 0000000..43f25ab
--- /dev/null
+++ b/grammars/japanese.gbnf
@@ -0,0 +1,7 @@
+# A probably incorrect grammar for Japanese
+root ::= jp-char+ ([ \t\n] jp-char+)*
+jp-char ::= hiragana | katakana | punctuation | cjk
+hiragana ::= [ぁ-ゟ]
+katakana ::= [ァ-ヿ]
+punctuation ::= [、-〾]
+cjk ::= [一-鿿]
diff --git a/grammars/json.gbnf b/grammars/json.gbnf
new file mode 100644
index 0000000..40fa2b6
--- /dev/null
+++ b/grammars/json.gbnf
@@ -0,0 +1,29 @@
+# Grammar for subset of JSON - doesn't support full string or number syntax
+
+root ::= object
+value ::= object | array | string | number | boolean | "null"
+
+object ::=
+  "{" ws (
+    string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}"
+
+array ::=
+  "[" ws (
+    value
+    ("," ws value)*
+  )? "]"
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+# Only plain integers currently
+number ::= "-"? [0-9]+ ws
+boolean ::= ("true" | "false") ws
+
+# Optional space: by convention, applied in this grammar after literal chars when allowed
+ws ::= ([ \t\n] ws)?
diff --git a/grammars/list.gbnf b/grammars/list.gbnf
new file mode 100644
index 0000000..51e6c9c
--- /dev/null
+++ b/grammars/list.gbnf
@@ -0,0 +1,4 @@
+root ::= item+
+
+# Excludes various line break characters
+item ::= "- " [^\r\n\x0b\x0c\x85\u2028\u2029]+ "\n"
-- 
cgit v1.2.3