Last active
December 31, 2021 11:21
-
-
Save RomainTT/93f7190ce925f00d439f1465b944cb0c to your computer and use it in GitHub Desktop.
Lark file for CSV schema v1.2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// RULES
// -----

// Entry point: a schema is a prolog followed by the column definitions.
schema : prolog body

// The prolog is the version declaration followed by the global directives.
prolog : versiondecl globaldirectives
versiondecl : ("version 1.0" | "version 1.1" | "version 1.2")

// Every global directive is optional, but they must appear in this order.
// noHeader and ignoreColumnNameCase are alternatives: at most one of them.
globaldirectives : separatordirective? quoteddirective? totalcolumnsdirective? permitemptydirective? (noheaderdirective | ignorecolumnnamecasedirective)?

// All directives, global or per-column, start with "@".
directiveprefix : "@"

separatordirective : directiveprefix "separator" (separatortabexpr | separatorchar)
// CSV Schema spells the tab keyword in upper case ("@separator TAB").
separatortabexpr : "TAB" | "\t"
separatorchar : CHARACTERLITERAL
quoteddirective : directiveprefix "quoted"
totalcolumnsdirective : directiveprefix "totalColumns" POSITIVENONZEROINTEGERLITERAL
permitemptydirective : directiveprefix "permitEmpty"
noheaderdirective : directiveprefix "noHeader"
ignorecolumnnamecasedirective : directiveprefix "ignoreColumnNameCase"
// The body is one or more column definitions, each optionally surrounded
// by comments.
body : bodypart+
bodypart : comment* columndefinition comment*

comment : singlelinecomment | multilinecomment

// "//" followed by anything up to (not including) the end of the line.
singlelinecomment : /\/\/[\S\t ]*/

// "/*" ... "*/". The delimiters are escaped as \/\* and \*\/ so that the
// regexp matches a slash-star pair rather than a backslash followed by "*".
// The comment text itself may span lines but may not contain "*".
multilinecomment : /\/\*(?:[^*\r\n]+|(?:\r?\n))*\*\//
// A column definition is "<identifier>: <rule>". The identifier is either
// bare (a positive non-zero integer or an IDENT) or a quoted string.
columndefinition : (columnidentifier | quotedcolumnidentifier) ":" columnrule
columnidentifier : POSITIVENONZEROINTEGERLITERAL | IDENT
quotedcolumnidentifier : STRINGLITERAL

// A column rule is any number of validation expressions followed by the
// (all optional, fixed-order) column directives.
columnrule : columnvalidationexpr* columndirectives
columndirectives : optionaldirective? matchisfalsedirective? ignorecasedirective? warningdirective?

optionaldirective : directiveprefix "optional"
matchisfalsedirective : directiveprefix "matchIsFalse"
ignorecasedirective : directiveprefix "ignoreCase"
// NOTE(review): the sibling directives use the bare directive name, and
// CSV Schema examples write this one as "@warning". The longer
// "@warningDirective" spelling is still accepted so that schemas which
// parsed before keep parsing; confirm against the spec before dropping it.
warningdirective : directiveprefix ("warning" | "warningDirective")
// Validation expressions. "or" / "and" are right-recursive: the left operand
// is always a non-combinatorial expression, the right operand may itself be
// another combination.
columnvalidationexpr : combinatorialexpr | noncombinatorialexpr
combinatorialexpr : orexpr | andexpr
orexpr : noncombinatorialexpr "or" columnvalidationexpr
andexpr : noncombinatorialexpr "and" columnvalidationexpr

noncombinatorialexpr : nonconditionalexpr | conditionalexpr
nonconditionalexpr : singleexpr | externalsingleexpr | parenthesizedexpr

// A single expression, optionally prefixed by an explicit column context
// ("$column/").
singleexpr : explicitcontextexpr? ( isexpr
                                  | anyexpr
                                  | notexpr
                                  | inexpr
                                  | startswithexpr
                                  | endswithexpr
                                  | regexpexpr
                                  | rangeexpr
                                  | lengthexpr
                                  | emptyexpr
                                  | notemptyexpr
                                  | uniqueexpr
                                  | uriexpr
                                  | xsddatetimeexpr
                                  | xsddatetimewithtimezoneexpr
                                  | xsddateexpr
                                  | xsdtimeexpr
                                  | ukdateexpr
                                  | dateexpr
                                  | partialukdateexpr
                                  | partialdateexpr
                                  | uuid4expr
                                  | positiveintegerexpr
                                  | uppercaseexpr
                                  | lowercaseexpr
                                  | identicalexpr )

explicitcontextexpr : columnref "/"
// "$" followed by a bare or quoted column identifier.
columnref : "$" (columnidentifier | quotedcolumnidentifier)
// Single (column-local) validation expressions.
// Keywords are case-sensitive and use the camelCase spelling of the
// CSV Schema language (xDateTime, ukDate, positiveInteger, ...).

// String tests taking one string provider.
isexpr : "is(" stringprovider ")"
anyexpr : "any(" stringprovider ")"
notexpr : "not(" stringprovider ")"
inexpr : "in(" stringprovider ")"
startswithexpr : "starts(" stringprovider ")"
endswithexpr : "ends(" stringprovider ")"
regexpexpr : "regex(" STRINGLITERAL ")"

// range(min, max): either bound may be the wildcard, but not both.
rangeexpr : "range(" (numericorany "," NUMERICLITERAL | NUMERICLITERAL "," numericorany) ")"
numericorany : NUMERICLITERAL | WILDCARDLITERAL

// length(n) or length(min, max); each value may be the wildcard.
lengthexpr : "length(" (positiveintegerorany ",")? positiveintegerorany ")"
positiveintegerorany : POSITIVEINTEGERLITERAL | WILDCARDLITERAL

emptyexpr : "empty"
notemptyexpr : "notEmpty"
// unique, optionally over a list of one or more column references.
uniqueexpr : "unique" ("(" columnref ("," columnref)* ")")?
uriexpr : "uri"

// Date/time tests; the optional "(from, to)" pair takes two literals of the
// matching kind.
xsddatetimeexpr : "xDateTime" ("(" XSDDATETIMELITERAL "," XSDDATETIMELITERAL ")")?
xsddatetimewithtimezoneexpr : "xDateTimeTz" ("(" XSDDATETIMEWITHTIMEZONELITERAL "," XSDDATETIMEWITHTIMEZONELITERAL ")")?
xsddateexpr : "xDate" ("(" XSDDATELITERAL "," XSDDATELITERAL ")")?
xsdtimeexpr : "xTime" ("(" XSDTIMELITERAL "," XSDTIMELITERAL ")")?
ukdateexpr : "ukDate" ("(" UKDATELITERAL "," UKDATELITERAL ")")?
// date(...) takes three string providers and an optional pair of
// XSD date literals.
dateexpr : "date(" stringprovider "," stringprovider "," stringprovider ("," XSDDATELITERAL "," XSDDATELITERAL)? ")"
partialukdateexpr : "partUkDate"
partialdateexpr : "partDate(" stringprovider "," stringprovider "," stringprovider ")"

uuid4expr : "uuid4"
positiveintegerexpr : "positiveInteger"
uppercaseexpr : "upperCase"
lowercaseexpr : "lowerCase"
// Uses its own keyword; it previously duplicated the literal of
// positiveintegerexpr, so "identical" could never be parsed.
identicalexpr : "identical"
// External expressions (file-related checks), optionally prefixed by an
// explicit column context. Keywords use the camelCase spelling of the
// CSV Schema language.
externalsingleexpr : explicitcontextexpr? (fileexistsexpr | integritycheckexpr | checksumexpr | filecountexpr)

// fileExists, with an optional string-provider argument.
fileexistsexpr : "fileExists" ("(" stringprovider ")")?

// integrityCheck(...): up to two optional string providers, then the
// mandatory quoted option "includeFolder" or "excludeFolder".
integritycheckexpr : "integrityCheck" "(" (stringprovider ",")? (stringprovider ",")? ("\"includeFolder\"" | "\"excludeFolder\"") ")"

// checksum(file(...), "<string literal>").
checksumexpr : "checksum(" fileexpr "," STRINGLITERAL ")"
// file(...) takes one or two string providers.
fileexpr : "file(" (stringprovider ",")? stringprovider ")"
filecountexpr : "fileCount(" fileexpr ")"
// A string provider yields a string: a column reference, a literal, or one
// of the string functions below (which nest, since their arguments are
// string providers themselves).
stringprovider : columnref | STRINGLITERAL | concatexpr | noextexpr | uridecodeexpr

// concat(a, b, ...): at least two arguments.
concatexpr : "concat(" stringprovider ("," stringprovider)+ ")"
// noExt(s): keyword uses the CSV Schema camelCase spelling.
noextexpr : "noExt(" stringprovider ")"
// uriDecode(s) or uriDecode(s, t): the second argument is optional.
uridecodeexpr : "uriDecode(" stringprovider ("," stringprovider)? ")"
// One or more validation expressions grouped in parentheses.
parenthesizedexpr : "(" columnvalidationexpr+ ")"

conditionalexpr : ifexpr | switchexpr

// if(condition, then-expressions [, else-expressions]).
// The condition may be combinatorial or non-conditional, i.e. an "if" or
// "switch" cannot be used directly as a condition.
ifexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ("," columnvalidationexpr+)? ")"

// switch(case+ [, default-expressions]) where each case is a two-argument
// if(condition, then-expressions) without an else part.
switchexpr : "switch(" switchcaseexpr+ ("," columnvalidationexpr+)? ")"
switchcaseexpr : "if(" (combinatorialexpr | nonconditionalexpr) "," columnvalidationexpr+ ")"
// TERMINALS
// ---------

// XSD date/time literals. The date/time separator is an upper-case "T" and
// the UTC designator an upper-case "Z", as in XML Schema lexical forms.
XSDDATETIMELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT "T" XSDTIMELITERAL
XSDDATETIMEWITHTIMEZONELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT "T" XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT
// The time zone is optional here. The "?" lives at the point of use because
// a Lark terminal may not match the empty string on its own.
XSDDATELITERAL : XSDDATEWITHOUTTIMEZONECOMPONENT XSDOPTIONALTIMEZONECOMPONENT?
XSDTIMELITERAL : XSDTIMEWITHOUTTIMEZONECOMPONENT XSDTIMEZONECOMPONENT

// yyyy-mm-dd with a per-month day limit (31, 30, or 29 for February).
XSDDATEWITHOUTTIMEZONECOMPONENT : /[0-9]{4}-(((0(1|3|5|7|8)|1(0|2))-(0[1-9]|(1|2)[0-9]|3[0-1]))|((0(4|6|9)|11)-(0[1-9]|(1|2)[0-9]|30))|(02-(0[1-9]|(1|2)[0-9])))/
// hh:mm:ss with optional ".mmm" milliseconds.
XSDTIMEWITHOUTTIMEZONECOMPONENT : /([0-1][0-9]|2[0-4]):(0[0-9]|[1-5][0-9]):(0[0-9]|[1-5][0-9])(\.[0-9]{3})?/
// "+hh:mm", "-hh:mm" or "Z". Both terminals share the same pattern; the
// optional one is made optional where it is referenced.
XSDOPTIONALTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|Z)/
XSDTIMEZONECOMPONENT : /((\+|-)(0[1-9]|1[0-9]|2[0-4]):(0[0-9]|[1-5][0-9])|Z)/

// dd/mm/yyyy with the same per-month day limits as above.
UKDATELITERAL : /(((0[1-9]|(1|2)[0-9]|3[0-1])\/(0(1|3|5|7|8)|1(0|2)))|((0[1-9]|(1|2)[0-9]|30)\/(0(4|6|9)|11))|((0[1-9]|(1|2)[0-9])\/02))\/[0-9]{4}/

// Numeric literals.
POSITIVENONZEROINTEGERLITERAL : /[1-9][0-9]*/
POSITIVEINTEGERLITERAL : /[0-9]+/
NUMERICLITERAL : /[0-9]+(\.[0-9]+)?/

// Double-quoted string; requires at least one character between the quotes.
STRINGLITERAL : "\"" /[^"]+/ "\""
// Single-quoted single character (no line break, form feed or quote).
CHARACTERLITERAL : "'" /[^\r\n\f']/ "'"
WILDCARDLITERAL : "*"

// Bare column identifier: ASCII letters of either case, digits, "-", "_", ".".
IDENT : /[A-Za-z0-9\-_\.]+/
// LARK SPECIFIC
// -------------
// Pull the whitespace terminals from Lark's common library and tell the
// parser to skip them wherever they occur between tokens.
%import common.NEWLINE
%import common.WS
%ignore NEWLINE
%ignore WS
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Original grammar is found here.
Lark is found here
Here are the changes between the original grammar and this Lark file:
- `/…/` `*` have been changed to `+`.
- `%import` and `%ignore` statements have been added.
- `-` is not valid with Lark. These have been removed.
- `?…?` are not understood by Lark. In case of regexp, it has been replaced by `/…/`, otherwise removed.