Last active
January 4, 2016 21:29
-
-
Save johnmyleswhite/8681455 to your computer and use it in GitHub Desktop.
Code for recognizing and normalizing identifiers in Julia
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
    isidentifier(s::AbstractString) -> Bool

Return `true` if `s` is a non-empty string whose first character is a letter or
`_` and whose remaining characters are each a letter, a digit, `_`, or `!`.
Return `false` otherwise, including for the empty string.

Letters are recognized with `isletter`, so non-ASCII letters such as `θ` are
accepted; digits are recognized with `isdigit`.

# Examples
```jldoctest
julia> isidentifier("a1!")
true

julia> isidentifier("1a")
false
```
"""
function isidentifier(s::AbstractString)
    # The empty string is never an identifier.
    isempty(s) && return false

    # Digits and '!' are allowed everywhere except the leading position.
    head = first(s)
    (isletter(head) || head == '_') || return false

    return all(Iterators.drop(s, 1)) do c
        isletter(c) || isdigit(c) || c == '_' || c == '!'
    end
end
"""
    makeidentifier(s::AbstractString) -> String

Turn an arbitrary string into one that satisfies the identifier rule used in
this file: a leading letter or `_`, followed by letters, digits, `_`, or `!`.

The conversion works as follows:

- Everything before the first letter, digit, `_`, or `!` is dropped.
- If that first usable character is a digit or `!` (which may not start an
  identifier), an `x` is inserted in front of it.
- After the first usable character, every run of other characters, including
  a trailing run, is replaced by a single `_`.
- If `s` is empty or contains no usable character at all, `"x"` is returned.

# Examples
```jldoctest
julia> makeidentifier(" 1\\t b?1???b!!")
"x1_b_1_b!!"

julia> makeidentifier("???")
"x"
```
"""
function makeidentifier(s::AbstractString)
    out = IOBuffer()
    started = false  # has the first usable character been emitted yet?
    ingap = false    # are we inside a run of unusable characters?

    for c in s
        usable = isletter(c) || isdigit(c) || c == '_' || c == '!'
        if !started
            # Leading unusable characters are dropped, not turned into '_'.
            usable || continue
            started = true
            # A digit or '!' cannot start an identifier, so prefix it with 'x'.
            (isletter(c) || c == '_') || print(out, 'x')
            print(out, c)
        elseif usable
            print(out, c)
            ingap = false
        elseif !ingap
            # Collapse the whole run of unusable characters into one '_'.
            print(out, '_')
            ingap = true
        end
    end

    # No usable character was found (this also covers the empty string).
    # Iterating the characters bounds the scan, so such inputs terminate.
    started || return "x"
    return String(take!(out))
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Tests for `isidentifier` and `makeidentifier`, which are defined in src.jl.
include("src.jl")
using Test

@testset "isidentifier" begin
    # 1-character strings: only a letter or an underscore may stand alone.
    @test isidentifier("a")
    @test !isidentifier("1")
    @test !isidentifier("!")
    @test isidentifier("_")

    # Every 2- and 3-character string that starts with a letter or an
    # underscore and continues with a letter, digit, '!' or '_' is accepted.
    heads = ('a', '_')
    tails = ('a', '1', '!', '_')
    for h in heads, t in tails
        @test isidentifier(string(h, t))
    end
    for h in heads, t1 in tails, t2 in tails
        @test isidentifier(string(h, t1, t2))
    end
end

@testset "makeidentifier" begin
    # Already-valid input and a leading digit.
    @test makeidentifier("a") == "a"
    @test makeidentifier("1") == "x1"

    # Leading whitespace is dropped.
    @test makeidentifier(" a") == "a"
    @test makeidentifier(" 1") == "x1"

    # Interior runs of unusable characters collapse to a single underscore.
    @test makeidentifier("a\t b?1???b!") == "a_b_1_b!"
    @test makeidentifier("1\t b?1???b!!") == "x1_b_1_b!!"

    # Non-ASCII letters are kept.
    @test makeidentifier(" a\t θ?!γγ1a") == "a_θ_!γγ1a"
    @test makeidentifier(" 1\t θ?!γγ1a") == "x1_θ_!γγ1a"
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
It makes sense that `makeidentifier("__1") == "__1"`, since `isidentifier("__1")` is true, and I also like the clean look of the stripped identifiers, though it seems a little inconsistent:
I think stripping underscores for friendlier identifiers makes sense.
I wouldn't have replaced leading underscores with 'x's because it adds more uncertainty about what the string was like before, but it's consistent with the practice of preferring to add 'x's in the first place, which brought me around.
Once this goes in `Base` or `DataFrames`, I'll update `readtable`.