Diff

util/json.lua @ 5565:6dd806829226

util.json: New, faster, stricter, more compliant JSON decoder. Now returns nil,err instead of throwing errors on invalid input.
author Waqas Hussain <waqas20@gmail.com>
date Tue, 07 May 2013 10:42:44 -0400
parent 5563:678867c552d1
child 5776:bd0ff8ae98a8
line wrap: on
line diff
--- a/util/json.lua	Tue May 07 10:41:03 2013 -0400
+++ b/util/json.lua	Tue May 07 10:42:44 2013 -0400
@@ -185,214 +185,177 @@
 -----------------------------------
 
 
-function json.decode(json)
-	json = json.." "; -- appending a space ensures valid json wouldn't touch EOF
-	local pos = 1;
-	local current = {};
-	local stack = {};
-	local ch, peek;
-	local function next()
-		ch = json:sub(pos, pos);
-		if ch == "" then error("Unexpected EOF"); end
-		pos = pos+1;
-		peek = json:sub(pos, pos);
-		return ch;
-	end
-	
-	local function skipwhitespace()
-		while ch and (ch == "\r" or ch == "\n" or ch == "\t" or ch == " ") do
-			next();
+-- Returns the position of the first byte at or after `index` that is not a
+-- space, tab, CR or LF. When only whitespace (or nothing) remains, `index`
+-- itself is returned unchanged.
+local function _skip_whitespace(json, index)
+	local first_non_ws = json:find("[^ \t\r\n]", index);
+	if first_non_ws then
+		return first_non_ws;
+	end
+	return index;
+end
+-- Post-processes a decoded object table in place: entries stored under the
+-- special keys "__array" and "__hash" are merged back into `obj` itself and
+-- both keys are removed. Returns the same (mutated) table.
+local function _fixobject(obj)
+	local __array = obj.__array;
+	if __array then
+		obj.__array = nil;
+		-- append every element of the "__array" list to obj
+		for i,v in ipairs(__array) do
+			t_insert(obj, v);
 		end
 	end
-	local function skiplinecomment()
-		repeat next(); until not(ch) or ch == "\r" or ch == "\n";
-		skipwhitespace();
-	end
-	local function skipstarcomment()
-		next(); next(); -- skip '/', '*'
-		while peek and ch ~= "*" and peek ~= "/" do next(); end
-		if not peek then error("eof in star comment") end
-		next(); next(); -- skip '*', '/'
-		skipwhitespace();
-	end
-	local function skipstuff()
-		while true do
-			skipwhitespace();
-			if ch == "/" and peek == "*" then
-				skipstarcomment();
-			elseif ch == "/" and peek == "/" then
-				skiplinecomment();
+	local __hash = obj.__hash;
+	if __hash then
+		obj.__hash = nil;
+		-- "__hash" is read as a flat list of alternating keys and values;
+		-- `k` holds the key that is waiting for its value
+		local k;
+		for i,v in ipairs(__hash) do
+			if k ~= nil then
+				obj[k] = v; k = nil;
 			else
-				return;
+				k = v;
 			end
 		end
 	end
-	
-	local readvalue;
-	local function readarray()
-		local t = setmetatable({}, array_mt);
-		next(); -- skip '['
-		skipstuff();
-		if ch == "]" then next(); return t; end
-		t_insert(t, readvalue());
-		while true do
-			skipstuff();
-			if ch == "]" then next(); return t; end
-			if not ch then error("eof while reading array");
-			elseif ch == "," then next();
-			elseif ch then error("unexpected character in array, comma expected"); end
-			if not ch then error("eof while reading array"); end
-			t_insert(t, readvalue());
+	return obj;
+end
+local _readvalue, _readstring; -- forward declarations; both are assigned further down
+-- Parses a JSON object whose opening "{" is at `index`.
+-- Returns the decoded table and the position just after the closing "}",
+-- or nil plus an error message ("key expected", "colon expected",
+-- "object eof", or whatever _readstring/_readvalue reported).
+-- A "}" in key position is only accepted directly after the "{" (empty
+-- object), so a trailing comma such as {"a":1,} is rejected.
+local function _readobject(json, index)
+	local o = {};
+	local first = true; -- true only until the first key has been seen
+	while true do
+		local key, val;
+		-- index sits on "{" or "," here; step over it and any whitespace
+		index = _skip_whitespace(json, index + 1);
+		if json:byte(index) ~= 0x22 then -- "\""
+			if first and json:byte(index) == 0x7d then return o, index + 1; end -- "}"
+			return nil, "key expected";
 		end
-	end
-	
-	local function checkandskip(c)
-		local x = ch or "eof";
-		if x ~= c then error("unexpected "..x..", '"..c.."' expected"); end
-		next();
-	end
-	local function readliteral(lit, val)
-		for c in lit:gmatch(".") do
-			checkandskip(c);
-		end
-		return val;
+		first = false;
+		key, index = _readstring(json, index);
+		if key == nil then return nil, index; end -- on failure `index` holds the error message
+		index = _skip_whitespace(json, index);
+		if json:byte(index) ~= 0x3a then return nil, "colon expected"; end -- ":"
+		val, index = _readvalue(json, index + 1);
+		if val == nil then return nil, index; end -- on failure `index` holds the error message
+		o[key] = val;
+		index = _skip_whitespace(json, index);
+		local b = json:byte(index);
+		if b == 0x7d then return _fixobject(o), index + 1; end -- "}"
+		if b ~= 0x2c then return nil, "object eof"; end -- ","
 	end
-	local function readstring()
-		local s = {};
-		checkandskip("\"");
-		while ch do
-			while ch and ch ~= "\\" and ch ~= "\"" do
-				t_insert(s, ch); next();
-			end
-			if ch == "\\" then
-				next();
-				if unescapes[ch] then
-					t_insert(s, unescapes[ch]);
-					next();
-				elseif ch == "u" then
-					local seq = "";
-					for i=1,4 do
-						next();
-						if not ch then error("unexpected eof in string"); end
-						if not ch:match("[0-9a-fA-F]") then error("invalid unicode escape sequence in string"); end
-						seq = seq..ch;
-					end
-					t_insert(s, codepoint_to_utf8(tonumber(seq, 16)));
-					next();
-				else error("invalid escape sequence in string"); end
-			end
-			if ch == "\"" then
-				next();
-				return t_concat(s);
-			end
-		end
-		error("eof while reading string");
-	end
-	local function readnumber()
-		local s = "";
-		if ch == "-" then
-			s = s..ch; next();
-			if not ch:match("[0-9]") then error("number format error"); end
-		end
-		if ch == "0" then
-			s = s..ch; next();
-			if ch:match("[0-9]") then error("number format error"); end
-		else
-			while ch and ch:match("[0-9]") do
-				s = s..ch; next();
-			end
+end
+-- Parses a JSON array whose opening "[" is at `index`.
+-- Returns a table carrying the array_mt metatable plus the position just
+-- after the closing "]", or nil plus an error message ("array eof" or
+-- whatever _readvalue reported).
+-- An empty array may contain whitespace between the brackets ("[ ]").
+local function _readarray(json, index)
+	local a = {};
+	local oindex = index;
+	while true do
+		local val;
+		-- index sits on "[" or "," here; the element starts after it
+		val, index = _readvalue(json, index + 1);
+		if val == nil then
+			-- An empty array lands here too, because "]" is not a value. That can
+			-- only be the case when the first non-whitespace byte after "[" is "]".
+			local close = _skip_whitespace(json, oindex + 1);
+			if json:byte(close) == 0x5d then return setmetatable(a, array_mt), close + 1; end -- "]"
+			return val, index;
 		end
-		if ch == "." then
-			s = s..ch; next();
-			if not ch:match("[0-9]") then error("number format error"); end
-			while ch and ch:match("[0-9]") do
-				s = s..ch; next();
-			end
-			if ch == "e" or ch == "E" then
-				s = s..ch; next();
-				if ch == "+" or ch == "-" then
-					s = s..ch; next();
-					if not ch:match("[0-9]") then error("number format error"); end
-					while ch and ch:match("[0-9]") do
-						s = s..ch; next();
-					end
-				end
-			end
-		end
-		return tonumber(s);
+		t_insert(a, val);
+		index = _skip_whitespace(json, index);
+		local b = json:byte(index);
+		if b == 0x5d then return setmetatable(a, array_mt), index + 1; end -- "]"
+		if b ~= 0x2c then return nil, "array eof"; end -- ","
+	end
+end
+local _unescape_error; -- set to true by _unescape_func on a malformed \u escape; reset and checked by _readstring
+-- Converts a 12-character "\uXXXX\uXXXX" surrogate pair (lead escape first,
+-- trail escape second) into the four-byte UTF-8 encoding of the code point
+-- the pair represents.
+local function _unescape_surrogate_func(x)
+	local hi = tonumber(x:sub(3, 6), 16);
+	local lo = tonumber(x:sub(9, 12), 16);
+	-- 0x35FDC00 == 0xD800 * 0x400 + 0xDC00 - 0x10000
+	local cp = hi * 0x400 + lo - 0x35FDC00;
+	-- peel off three 6-bit continuation groups, lowest bits first
+	local tail = {};
+	for i = 3, 1, -1 do
+		local low6 = cp % 64;
+		tail[i] = 0x80 + low6;
+		cp = (cp - low6) / 64;
+	end
+	return s_char(0xF0 + cp, tail[1], tail[2], tail[3]);
+end
+-- gsub callback for "\u" escapes. `x` is the matched text: "\u" followed by
+-- up to four further characters (see the pattern used in _readstring).
+-- Returns the UTF-8 text for the escape when exactly four hex digits follow
+-- the "\u"; otherwise sets _unescape_error and returns nothing.
+local function _unescape_func(x)
+	-- start matching at position 3 to skip the leading "\u"
+	x = x:match("%x%x%x%x", 3);
+	if x then
+		--if x >= 0xD800 and x <= 0xDFFF then _unescape_error = true; end -- bad surrogate pair
+		return codepoint_to_utf8(tonumber(x, 16));
 	end
-	local function readmember(t)
-		skipstuff();
-		local k = readstring();
-		skipstuff();
-		checkandskip(":");
-		t[k] = readvalue();
+	_unescape_error = true;
+end
+-- Reads a JSON string whose opening quote is at `index`.
+-- Relies on json.decode having already rewritten every backslash escape
+-- into \uXXXX form, so the next '"' always terminates the string.
+-- Returns the unescaped string and the position just after the closing
+-- quote, or nil plus "invalid escape" / "string eof".
+function _readstring(json, index)
+	index = index + 1;
+	local endindex = json:find("\"", index, true);
+	if endindex then
+		local s = json:sub(index, endindex - 1);
+		--if s:find("[%z-\31]") then return nil, "control char in string"; end
+		-- FIXME handle control characters
+		_unescape_error = nil;
+		-- Combine lead+trail surrogate pairs into a single code point first, so
+		-- characters beyond the BMP come out as one 4-byte UTF-8 sequence
+		-- rather than two separately encoded surrogates.
+		s = s:gsub("\\u[dD][89abAB]%x%x\\u[dD][cdefCDEF]%x%x", _unescape_surrogate_func);
+		-- every remaining \uXXXX escape is decoded on its own
+		s = s:gsub("\\u.?.?.?.?", _unescape_func);
+		if _unescape_error then return nil, "invalid escape"; end
+		return s, endindex + 1;
+	end
+	return nil, "string eof";
+end
+-- Reads a JSON number starting at `index`, following the JSON grammar
+--   -?(0|[1-9][0-9]*)(\.[0-9]+)?([eE][+-]?[0-9]+)?
+-- Returns the number and the position just after it, or nil plus
+-- "number format error". Only the longest valid prefix is consumed; any
+-- bytes left over (e.g. the "1" of "01") are for the caller to reject.
+local function _readnumber(json, index)
+	-- integer part: optional "-", then a lone "0" or a non-zero digit plus more digits
+	local pos = json:match("^%-?0()", index) or json:match("^%-?[1-9][0-9]*()", index);
+	if not pos then return nil, "number format error"; end
+	pos = json:match("^%.[0-9]+()", pos) or pos; -- optional fraction
+	pos = json:match("^[eE][%+%-]?[0-9]+()", pos) or pos; -- optional exponent
+	local n = tonumber(json:sub(index, pos - 1));
+	if n == nil then return nil, "number format error"; end
+	return n, pos;
+end
+-- Reads the literal "null". `index` points at the "n" (already checked by
+-- the caller), so only the following three bytes are compared with "ull".
+-- Returns the `null` value (defined outside this excerpt) and the position
+-- just after the literal, or nil plus "null parse failed".
+local function _readnull(json, index)
+	local a, b, c = json:byte(index + 1, index + 3);
+	if a == 0x75 and b == 0x6c and c == 0x6c then
+		return null, index + 4;
 	end
-	local function fixobject(obj)
-		local __array = obj.__array;
-		if __array then
-			obj.__array = nil;
-			for i,v in ipairs(__array) do
-				t_insert(obj, v);
-			end
-		end
-		local __hash = obj.__hash;
-		if __hash then
-			obj.__hash = nil;
-			local k;
-			for i,v in ipairs(__hash) do
-				if k ~= nil then
-					obj[k] = v; k = nil;
-				else
-					k = v;
-				end
-			end
-		end
-		return obj;
+	return nil, "null parse failed";
+end
+-- Reads the literal "true". `index` points at the "t", so only the three
+-- bytes after it are compared with "rue".
+-- Returns true and the position just after the literal, or nil plus
+-- "true parse failed".
+local function _readtrue(json, index)
+	if json:sub(index + 1, index + 3) ~= "rue" then
+		return nil, "true parse failed";
+	end
+	return true, index + 4;
+end
+-- Reads the literal "false". `index` points at the "f" (already checked by
+-- the caller), so only the following four bytes are compared with "alse".
+-- Returns false and the position just after the literal, or nil plus
+-- "false parse failed".
+local function _readfalse(json, index)
+	local a, b, c, d = json:byte(index + 1, index + 4);
+	if a == 0x61 and b == 0x6c and c == 0x73 and d == 0x65 then
+		return false, index + 5;
 	end
-	local function readobject()
-		local t = {};
-		next(); -- skip '{'
-		skipstuff();
-		if ch == "}" then next(); return t; end
-		if not ch then error("eof while reading object"); end
-		readmember(t);
-		while true do
-			skipstuff();
-			if ch == "}" then next(); return fixobject(t); end
-			if not ch then error("eof while reading object");
-			elseif ch == "," then next();
-			elseif ch then error("unexpected character in object, comma expected"); end
-			if not ch then error("eof while reading object"); end
-			readmember(t);
-		end
+	return nil, "false parse failed";
+end
+-- Reads one JSON value starting at or after `index` (leading whitespace is
+-- skipped) by dispatching on its first byte to the matching reader.
+-- Returns whatever that reader returns: the value and the position after
+-- it, or nil plus an error message ("value expected" when no reader fits).
+function _readvalue(json, index)
+	index = _skip_whitespace(json, index);
+	local first_byte = json:byte(index); -- nil when index is past the end
+	if first_byte == 0x7B then return _readobject(json, index); end -- "{"
+	if first_byte == 0x5B then return _readarray(json, index); end -- "["
+	if first_byte == 0x22 then return _readstring(json, index); end -- "\""
+	local is_digit = first_byte ~= nil and first_byte >= 0x30 and first_byte <= 0x39; -- "0"-"9"
+	if is_digit or first_byte == 0x2d then return _readnumber(json, index); end -- "-"
+	if first_byte == 0x6e then return _readnull(json, index); end -- "n"
+	if first_byte == 0x74 then return _readtrue(json, index); end -- "t"
+	if first_byte == 0x66 then -- "f"
+		return _readfalse(json, index);
 	end
+	return nil, "value expected";
+end
+-- Lookup table for json.decode's first pass: maps each two-character
+-- backslash escape to its \uXXXX equivalent. "\u" maps to itself, and
+-- escapes without an entry are left untouched by gsub.
+local first_escape = {
+	['\\"'] = '\\u0022',
+	['\\\\'] = '\\u005c',
+	['\\/'] = '\\u002f',
+	['\\b'] = '\\u0008',
+	['\\f'] = '\\u000C',
+	['\\n'] = '\\u000A',
+	['\\r'] = '\\u000D',
+	['\\t'] = '\\u0009',
+	['\\u'] = '\\u',
+};
+
+-- Decodes a JSON document held in the string `json` (inside this function
+-- the parameter shadows the module table of the same name).
+-- Returns the decoded value on success, or nil plus an error message when
+-- parsing fails or non-whitespace text follows the value.
+function json.decode(json)
+	json = json:gsub("\\.", first_escape) -- get rid of all escapes except \uXXXX, making string parsing much simpler
+		--:gsub("[\r\n]", "\t"); -- \r\n\t are equivalent, we care about none of them, and none of them can be in strings
 	
-	function readvalue()
-		skipstuff();
-		while ch do
-			if ch == "{" then
-				return readobject();
-			elseif ch == "[" then
-				return readarray();
-			elseif ch == "\"" then
-				return readstring();
-			elseif ch:match("[%-0-9%.]") then
-				return readnumber();
-			elseif ch == "n" then
-				return readliteral("null", null);
-			elseif ch == "t" then
-				return readliteral("true", true);
-			elseif ch == "f" then
-				return readliteral("false", false);
-			else
-				error("invalid character at value start: "..ch);
-			end
-		end
-		error("eof while reading value");
-	end
-	next();
-	return readvalue();
+	-- TODO do encoding verification
+	
+	local val, index = _readvalue(json, 1);
+	-- on failure `index` holds the error message rather than a position
+	if val == nil then return val, index; end
+	-- only whitespace may follow the top-level value
+	if json:find("[^ \t\r\n]", index) then return nil, "garbage at eof"; end
+
+	return val;
 end
 
 function json.test(object)