-
-
Save pmarreck/4626713 to your computer and use it in GitHub Desktop.
# RFC 5322 Email Validation Regex in Ruby
# This work is released under the BSD 3-Clause License
# http://choosealicense.com/licenses/bsd-3-clause/
# Copyleft (ɔ) 2013, Peter R. Marreck
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
#
# Redistributions of source code must retain the above copyright notice, this
# list of conditions and the following disclaimer.
#
# Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation and/or
# other materials provided with the distribution.
#
# Neither the name of the {organization} nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
# ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
# ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
# EMAIL matches a string that is, in its entirety, one RFC 5322 addr-spec
# (local-part "@" domain).
#
# How the pattern is built:
# * Each grammar rule is a named group followed by {0}. The {0} quantifier
#   defines the group without matching anything at that position; rules are
#   invoked by name with \g<name>, which also allows forward references and
#   recursion (nested comments).
# * The only part that actually consumes input is the last line, which calls
#   addr_spec between \A and \z. Without those anchors an address found
#   anywhere inside the input counted as a match, so strings such as
#   "Joe Smith <email@domain.com>" or "email@domain@domain.com" were accepted.
#   \A and \z are used rather than ^ and $ because the latter match at line
#   boundaries and folding whitespace may contain line breaks.
# * The i flag is not used: every character class already lists both letter
#   cases, so case-insensitive matching adds nothing for ASCII input.
#
# Behaviour worth knowing:
# * Comments and folding whitespace (CFWS) are part of the grammar, so
#   "email@domain.com (Joe Smith)" is a match.
# * This is a syntax check only. Domains like "-domain.com" or
#   "111.222.333.44444" are syntactically valid dot-atoms and match.
# * NOTE(review): local_part and domain are atomic groups, so once the first
#   alternative (dot_atom) succeeds the later obs_ alternatives are never
#   retried for that position. Obsolete forms that begin with a plain atom,
#   e.g. a."b"@example.com, are therefore rejected.
EMAIL = /
  (?<addr_spec>      (?> \g<local_part> @ \g<domain> ) ){0}
  (?<local_part>     (?> \g<dot_atom> | \g<quoted_string> | \g<obs_local_part> ) ){0}
  (?<domain>         (?> \g<dot_atom> | \g<domain_literal> | \g<obs_domain> ) ){0}
  (?<domain_literal> (?> \g<CFWS>? \[ (?: \g<FWS>? \g<dtext> )* \g<FWS>? \] \g<CFWS>? ) ){0}
  (?<dtext>          (?> [\x21-\x5a] | [\x5e-\x7e] | \g<obs_dtext> ) ){0}
  (?<quoted_pair>    (?> \\ (?: \g<VCHAR> | \g<WSP> ) | \g<obs_qp> ) ){0}
  (?<dot_atom>       (?> \g<CFWS>? \g<dot_atom_text> \g<CFWS>? ) ){0}
  (?<dot_atom_text>  (?> \g<atext> (?: \. \g<atext> )* ) ){0}
  (?<atext>          (?> [a-zA-Z0-9!\#\$%&'*\+\/\=\?\^_`{\|}~\-]+ ) ){0}
  (?<atom>           (?> \g<CFWS>? \g<atext> \g<CFWS>? ) ){0}
  (?<word>           (?> \g<atom> | \g<quoted_string> ) ){0}
  (?<quoted_string>  (?> \g<CFWS>? " (?: \g<FWS>? \g<qcontent> )* \g<FWS>? " \g<CFWS>? ) ){0}
  (?<qcontent>       (?> \g<qtext> | \g<quoted_pair> ) ){0}
  (?<qtext>          (?> \x21 | [\x23-\x5b] | [\x5d-\x7e] | \g<obs_qtext> ) ){0}
  # comments and whitespace
  (?<FWS>            (?> (?: \g<WSP>* \r\n )? \g<WSP>+ | \g<obs_FWS> ) ){0}
  (?<CFWS>           (?> (?: \g<FWS>? \g<comment> )+ \g<FWS>? | \g<FWS> ) ){0}
  (?<comment>        (?> \( (?: \g<FWS>? \g<ccontent> )* \g<FWS>? \) ) ){0}
  (?<ccontent>       (?> \g<ctext> | \g<quoted_pair> | \g<comment> ) ){0}
  (?<ctext>          (?> [\x21-\x27] | [\x2a-\x5b] | [\x5d-\x7e] | \g<obs_ctext> ) ){0}
  # obsolete tokens
  (?<obs_domain>     (?> \g<atom> (?: \. \g<atom> )* ) ){0}
  (?<obs_local_part> (?> \g<word> (?: \. \g<word> )* ) ){0}
  (?<obs_dtext>      (?> \g<obs_NO_WS_CTL> | \g<quoted_pair> ) ){0}
  (?<obs_qp>         (?> \\ (?: \x00 | \g<obs_NO_WS_CTL> | \n | \r ) ) ){0}
  (?<obs_FWS>        (?> \g<WSP>+ (?: \r\n \g<WSP>+ )* ) ){0}
  (?<obs_ctext>      (?> \g<obs_NO_WS_CTL> ) ){0}
  (?<obs_qtext>      (?> \g<obs_NO_WS_CTL> ) ){0}
  (?<obs_NO_WS_CTL>  (?> [\x01-\x08] | \x0b | \x0c | [\x0e-\x1f] | \x7f ) ){0}
  # character class definitions
  (?<VCHAR>          (?> [\x21-\x7E] ) ){0}
  (?<WSP>            [\x20\t] ){0}
  # the whole input, start to end, must be exactly one addr_spec
  \A \g<addr_spec> \z
/ux
This basically only checks for the presence of the @ sign. I haven't found a single case where the @ sign is present and this doesn't match.
@shreve That's a valid criticism- easily fixed I think by adding a ^ to the front of the regex and a $ to the end.
Apparently this needs a test suite. I will look into adding that
@pmarreck I wouldn't use `^` or `$`, since they match the start and end of a line. Since folding allows multiple lines, you should use `\A` and `\Z` to match the start and end of the whole string.
@pmarreck You may need to do some search & replace, but this is a good start for a test suite. 4 cases are failing currently though!:
# RSpec examples for an email-validating input type: each example passes one
# candidate string to described_class.coerce_non_null_input and expects either
# a GraphQL::CoercionError (invalid cases) or no error (valid cases).
# `described_class` and `context` are not defined in this snippet; they come
# from the surrounding spec file.
# NOTE(review): most addresses below read "[email protected]" — this looks like
# page-level email obfuscation replaced the original literals, so the inputs
# must be restored from the linked test-case list before this can be run.
# The four `xspecify` examples are skipped; presumably they are the "4 cases"
# reported as failing — confirm.
describe 'input validation' do
# Test cases copied from https://blogs.msdn.microsoft.com/testing123/2009/02/06/email-address-test-cases/
describe 'invalid cases' do
specify 'Missing @ sign and domain' do
expect { described_class.coerce_non_null_input("plainaddress", context) }.to raise_error(GraphQL::CoercionError)
end
# NOTE(review): in a double-quoted Ruby string `#$@` is interpolation of the
# global `$@`, so the string actually tested is probably not the literal shown
# here; a single-quoted literal would avoid that — verify.
xspecify 'Garbage' do
expect { described_class.coerce_non_null_input("#@%^%#$@#$@#.com", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Missing username' do
expect { described_class.coerce_non_null_input("@domain.com", context) }.to raise_error(GraphQL::CoercionError)
end
# NOTE(review): the title and the input ("Joe") look truncated, likely by the
# same text mangling; the intended input is presumably a display-name form
# with the address in angle brackets — confirm against the linked list.
specify 'Smith <[email protected]> Encoded html within email is invalid' do
expect { described_class.coerce_non_null_input("Joe", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Missing @' do
expect { described_class.coerce_non_null_input("email.domain.com", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Two @ sign' do
expect { described_class.coerce_non_null_input("email@[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Leading dot in address is not allowed' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Trailing dot in address is not allowed' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Multiple dots' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Unicode char as address' do
expect { described_class.coerce_non_null_input("あいうえお@domain.com", context) }.to raise_error(GraphQL::CoercionError)
end
# Skipped: a trailing parenthesised comment after the address.
xspecify 'Text followed email is not allowed' do
expect { described_class.coerce_non_null_input("[email protected] (Joe Smith)", context) }.to raise_error(GraphQL::CoercionError)
end
# Skipped: hostname-level rule (leading dash in a domain label).
xspecify 'Leading dash in front of domain is invalid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
# Skipped: IP-address range rule.
xspecify 'Invalid IP format' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
specify 'Multiple dot in the domain portion is invalid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.to raise_error(GraphQL::CoercionError)
end
end
# Inputs that must be accepted without raising.
describe 'valid cases' do
specify 'Valid email' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Email contains dot in the address field' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Email contains dot with subdomain' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Plus sign is considered valid character' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Domain is valid IP address' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Square bracket around IP address is considered valid' do
expect { described_class.coerce_non_null_input("email@[123.123.123.123]", context) }.not_to raise_error
end
specify 'Quotes around email is considered valid' do
expect { described_class.coerce_non_null_input("\"email\"@domain.com", context) }.not_to raise_error
end
specify 'Digits in address are valid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Dash in domain name is valid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Underscore in the address field is valid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify '.name is valid Top Level Domain name' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Dot in Top Level Domain name also considered valid (use co.jp as example here)' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
specify 'Dash in address field is valid' do
expect { described_class.coerce_non_null_input("[email protected]", context) }.not_to raise_error
end
end
end
@shreve That's a valid criticism- easily fixed I think by adding a ^ to the front of the regex and a $ to the end.
Apparently this needs a test suite. I will look into adding that
As you might remember, I've used it in my "gravaty" gem (see: https://savannah.nongnu.org/projects/gravaty/) and I've used RFC example to test it. There is at least one situation where it matches but it shouldn't: https://savannah.nongnu.org/bugs/?39928
@shreve That's a valid criticism- easily fixed I think by adding a ^ to the front of the regex and a $ to the end.
Apparently this needs a test suite. I will look into adding that
I've done something, since I'm using your gist into my code: http://hg.savannah.nongnu.org/hgweb/gravaty/file/29f7ab331849/test/gravaty/utils/test_rfc5322.rb
And this http://hg.savannah.nongnu.org/hgweb/gravaty/file/29f7ab331849/lib/gravaty/utils/rfc5322.rb is the file I'm testing.
@axos88 Did you ever make those 4 pass? Considering digging this guy out again to polish it up
Email validation is basically the proof that getting 95% of the way there (wherever "there" is) is not hard, but getting 100% of the way there is an order of magnitude (or more) harder 😊
For the record, I may not have even confirmed this actually works- I just tried to make it as "technically correct" as possible. This was something I did "for fun" after work one day while trying to make sense of the RFC after a coworker was working on some email parsing issues. It could really, for example, use a test suite asserting its validity. I welcome any enhancements to this work, though!
In particular, it seems to validate inputs like "Firstname Lastname" [email protected] just fine but it won't put the quoted part into the quoted_strings matcher part of the regex match object. Little details like that...