Fixed some case-folding and added Table A.1 for IDNA (#42).
This commit is contained in:
parent
c09de16347
commit
f955dca417
@ -2,14 +2,12 @@
|
||||
|
||||
import { toUtf8CodePoints, _toUtf8String, UnicodeNormalizationForm } from "./utf8";
|
||||
|
||||
let _tmp = 0;
|
||||
|
||||
/**
 *  One entry of a delta-encoded code point range table.
 *
 *  matchMap walks a table in order and keeps a running base
 *  (`lo += range.l`), so `l` is an offset from the previous entry's
 *  base rather than an absolute code point.
 */
type Ranged = {
    l: number,          // Lo value (added to the previous entry's lo)
    h: number,          // High value (less the lo)
    d?: number,         // Delta/stride (default: 1)
    s?: number,         // Shift (default: 1)
    e?: Array<number>   // Exceptions to skip (as offsets from lo)
};
|
||||
|
||||
type Table = { [ src: number ]: Array<number> };
|
||||
@ -40,6 +38,37 @@ function createTable(data: string, func?: (value: string) => Array<number>): Tab
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
 *  Decodes a compact, comma-separated list of hexadecimal ranges into
 *  the delta-encoded table format consumed by matchMap.
 *
 *  Each entry is one of:
 *    - `GAP`       a range with h = 0
 *    - `GAP-`      a range with h = 1
 *    - `GAP-SPAN`  a range with h = SPAN
 *
 *  The stored `l` is GAP plus the previous entry's h, so that adding it
 *  to the previous entry's lo (as matchMap does) lands GAP past the end
 *  of the previous range.
 *
 *  @param data  the encoded table; an empty string yields an empty table
 *  @returns the decoded entries, in input order
 *  @throws Error if an entry does not start with valid hexadecimal
 */
function createRangeTable(data: string): Array<Ranged> {
    // "".split(",") is [ "" ], which would decode to { l: NaN, h: 0 }
    if (data === "") { return [ ]; }

    let hi = 0;
    return data.split(",").map((v) => {
        const comps = v.split("-");

        // No dash means a single value; a trailing dash means a pair
        let span = "0";
        if (comps.length > 1) {
            span = (comps[1] === "") ? "1": comps[1];
        }

        const gap = parseInt(comps[0], 16);
        const height = parseInt(span, 16);

        // A NaN here would propagate through the running base in matchMap
        // and silently disable every entry that follows
        if (isNaN(gap) || isNaN(height)) {
            throw new Error("invalid range table entry: " + JSON.stringify(v));
        }

        const lo = hi + gap;
        hi = height;
        return { l: lo, h: hi };
    });
}
|
||||
|
||||
/**
 *  Finds the first entry of a delta-encoded range table that contains
 *  value.
 *
 *  An entry matches when value lies within [lo, lo + h], sits on the
 *  entry's stride (d, treated as 1 when absent or 0) and its offset
 *  from lo is not listed in the entry's exceptions (e).
 *
 *  @param value   the number to look up
 *  @param ranges  table whose `l` fields are offsets from the previous
 *                 entry's lo
 *  @returns the matching entry, or null if no entry matches
 */
function matchMap(value: number, ranges: Array<Ranged>): Ranged | null {
    let lo = 0;
    for (let i = 0; i < ranges.length; i++) {
        const range = ranges[i];

        // Entries are relative; accumulate to get this entry's absolute lo
        lo += range.l;

        const offset = value - lo;
        if (value >= lo && value <= lo + range.h && (offset % (range.d || 1)) === 0) {
            // An excepted offset only rules out this entry; keep scanning
            if (range.e && range.e.indexOf(offset) !== -1) { continue; }
            return range;
        }
    }
    return null;
}
|
||||
|
||||
const Table_A_1_ranges = createRangeTable("221,13-1b,5f-,40-10,51-f,11-3,3-3,2-2,2-4,8,2,15,2d,28-8,88,48,27-,3-5,11-20,27-,8,28,3-5,12,18,b-a,1c-4,6-16,2-d,2-2,2,1b-4,17-9,8f-,10,f,1f-2,1c-34,33-14e,4,36-,13-,6-2,1a-f,4,9-,3-,17,8,2-2,5-,2,8-,3-,4-8,2-3,3,6-,16-6,2-,7-3,3-,17,8,3,3,3-,2,6-3,3-,4-a,5,2-6,10-b,4,8,2,4,17,8,3,6-,b,4,4-,2-e,2-4,b-10,4,9-,3-,17,8,3-,5-,9-2,3-,4-7,3-3,3,4-3,c-10,3,7-2,4,5-2,3,2,3-2,3-2,4-2,9,4-3,6-2,4,5-8,2-e,d-d,4,9,4,18,b,6-3,8,4,5-6,3-8,3-3,b-11,3,9,4,18,b,6-3,8,4,5-6,3-6,2,3-3,b-11,3,9,4,18,11-3,7-,4,5-8,2-7,3-3,b-11,3,13-2,19,a,2-,8-2,2-3,7,2,9-11,4-b,3b-3,1e-24,3,2-,3,2-,2-5,5,8,4,2,2-,3,e,4-,6,2,7-,b-,3-21,49,23-5,1c-3,9,25,10-,2-2f,23,6,3,8-2,5-5,1b-45,27-9,2a-,2-3,5b-4,45-4,53-5,8,40,2,5-,8,2,5-,28,2,5-,20,2,5-,8,2,5-,8,8,18,20,2,5-,8,28,14-5,1d-22,56-b,277-8,1e-2,52-e,e,8-a,18-8,15-b,e,4,3-b,5e-2,b-15,10,b-5,59-7,2b-555,9d-3,5b-5,17-,7-,27-,7-,9,2,2,2,20-,36,10,f-,7,14-,4,a,54-3,2-6,6-5,9-,1c-10,13-1d,1c-14,3c-,10-6,32-b,240-30,28-18,c-14,a0,115-,3,66-,b-76,5,5-,1d,24,2,5-2,2,8-,35-2,19,f-10,1d-3,311-37f,1b,5a-b,d7-19,d-3,41,57-,68-4,29-3,5f,29-37,2e-2,25-c,2c-2,4e-3,30,78-3,64-,20,19b7-49,51a7-59,48e-2,38-738,2ba5-5b,222f-,3c-94,8-b,6-4,1b,6,2,3,3,6d-20,16e-f,41-,37-7,2e-2,11-f,5-b,18-,b,14,5-3,6,88-,2,bf-2,7-,7-,7-,4-2,8,8-9,8-2ff,20,5-b,1c-b4,27-,27-cbb1,f7-9,28-2,b5-221,56,48,3-,2-,3-,5,d,2,5,3,42,5-,9,8,1d,5,6,2-2,8,153-3,123-3,33-27fd,a6da-5128,21f-5df,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3-fffd,3,2-1d,61-ff7d");
|
||||
|
||||
// @TODO: Make this relative...
|
||||
const Table_B_1_flags = "ad,34f,1806,180b,180c,180d,200b,200c,200d,2060,feff".split(",").map((v) => parseInt(v, 16));
|
||||
|
||||
@ -91,31 +120,8 @@ const Table_B_2_lut_abs = createTable("b5:3bc,c3:ff,7:73,2:253,5:254,3:256,1:257
|
||||
const Table_B_2_lut_rel = createTable("179:1,2:1,2:1,5:1,2:1,a:4f,a:1,8:1,2:1,2:1,3:1,5:1,3:1,4:1,2:1,3:1,4:1,8:2,1:1,2:2,1:1,2:2,27:2,195:26,2:25,1:25,1:25,2:40,2:3f,1:3f,33:1,11:-6,1:-9,1ac7:-3a,6d:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,b:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,c:-8,2:-8,2:-8,2:-8,9:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,1:-8,49:-8,1:-8,1:-4a,1:-4a,d:-56,1:-56,1:-56,1:-56,d:-8,1:-8,f:-8,1:-8,3:-7");
|
||||
const Table_B_2_complex = createTable("df:00730073,51:00690307,19:02BC006E,a7:006A030C,18a:002003B9,16:03B903080301,20:03C503080301,1d7:05650582,190f:00680331,1:00740308,1:0077030A,1:0079030A,1:006102BE,b6:03C50313,2:03C503130300,2:03C503130301,2:03C503130342,2a:1F0003B9,1:1F0103B9,1:1F0203B9,1:1F0303B9,1:1F0403B9,1:1F0503B9,1:1F0603B9,1:1F0703B9,1:1F0003B9,1:1F0103B9,1:1F0203B9,1:1F0303B9,1:1F0403B9,1:1F0503B9,1:1F0603B9,1:1F0703B9,1:1F2003B9,1:1F2103B9,1:1F2203B9,1:1F2303B9,1:1F2403B9,1:1F2503B9,1:1F2603B9,1:1F2703B9,1:1F2003B9,1:1F2103B9,1:1F2203B9,1:1F2303B9,1:1F2403B9,1:1F2503B9,1:1F2603B9,1:1F2703B9,1:1F6003B9,1:1F6103B9,1:1F6203B9,1:1F6303B9,1:1F6403B9,1:1F6503B9,1:1F6603B9,1:1F6703B9,1:1F6003B9,1:1F6103B9,1:1F6203B9,1:1F6303B9,1:1F6403B9,1:1F6503B9,1:1F6603B9,1:1F6703B9,3:1F7003B9,1:03B103B9,1:03AC03B9,2:03B10342,1:03B1034203B9,5:03B103B9,6:1F7403B9,1:03B703B9,1:03AE03B9,2:03B70342,1:03B7034203B9,5:03B703B9,6:03B903080300,1:03B903080301,3:03B90342,1:03B903080342,b:03C503080300,1:03C503080301,1:03C10313,2:03C50342,1:03C503080342,b:1F7C03B9,1:03C903B9,1:03CE03B9,2:03C90342,1:03C9034203B9,5:03C903B9,ac:00720073,5b:00B00063,6:00B00066,d:006E006F,a:0073006D,1:00740065006C,1:0074006D,124f:006800700061,2:00610075,2:006F0076,b:00700061,1:006E0061,1:03BC0061,1:006D0061,1:006B0061,1:006B0062,1:006D0062,1:00670062,3:00700066,1:006E0066,1:03BC0066,4:0068007A,1:006B0068007A,1:006D0068007A,1:00670068007A,1:00740068007A,15:00700061,1:006B00700061,1:006D00700061,1:006700700061,8:00700076,1:006E0076,1:03BC0076,1:006D0076,1:006B0076,1:006D0076,1:00700077,1:006E0077,1:03BC0077,1:006D0077,1:006B0077,1:006D0077,1:006B03C9,1:006D03C9,2:00620071,3:00632215006B0067,1:0063006F002E,1:00640062,1:00670079,2:00680070,2:006B006B,1:006B006D,9:00700068,2:00700070006D,1:00700072,2:00730076,1:00770062,c723:00660066,1:00660069,1:0066006C,1:006600660069,1:00660066006C,1:00730074,1:00730074,d:05740576,1:05740565,1:0574056B,1:057E0576,1:0574056D", bytes2);
|
||||
|
||||
_tmp = 0;
|
||||
const Table_C_flags = "70f,f71,18e".split(",").map((v) => {
|
||||
_tmp += parseInt(v, 16);
|
||||
return _tmp;
|
||||
});
|
||||
_tmp = 0;
|
||||
const Table_C_ranges = "80-20,2c0,1cc0-f,28-7,37-4,b-5,f86-b,a810-20ff,25d0-1f,229-6,d17a-7,2e8b,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,10000,22-5f".split(",").map((v) => {
|
||||
let comps = v.split("-");
|
||||
if (comps.length === 1) { comps[1] = "1"; }
|
||||
_tmp += parseInt(comps[0], 16);
|
||||
return { l: _tmp, h: parseInt(comps[1], 16) }
|
||||
});
|
||||
const Table_C_ranges = createRangeTable("80-20,2a0-,39c,32,f71,18e,7f2-f,19-7,30-4,7-5,f81-b,5,a800-20ff,4d1-1f,110,fa-6,d174-7,2e84-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,ffff-,2,1f-5f,ff7f-20001");
|
||||
|
||||
/**
 *  Finds the first entry of a delta-encoded range table that contains
 *  value.
 *
 *  An entry matches when value lies within [lo, lo + h], sits on the
 *  entry's stride (d, treated as 1 when absent or 0) and its offset
 *  from lo is not listed in the entry's exceptions (e).
 *
 *  @param value   the number to look up
 *  @param ranges  table whose `l` fields are offsets from the previous
 *                 entry's lo
 *  @returns the matching entry, or null if no entry matches
 */
function matchMap(value: number, ranges: Array<Ranged>): Ranged | null {
    let lo = 0;
    for (let i = 0; i < ranges.length; i++) {
        const range = ranges[i];

        // Entries are relative; accumulate to get this entry's absolute lo
        lo += range.l;

        const offset = value - lo;
        if (value >= lo && value <= lo + range.h && (offset % (range.d || 1)) === 0) {
            // An excepted offset only rules out this entry; keep scanning
            if (range.e && range.e.indexOf(offset) !== -1) { continue; }
            return range;
        }
    }
    return null;
}
|
||||
|
||||
function flatten(values: Array<Array<number>>): Array<number> {
|
||||
return values.reduce((accum, value) => {
|
||||
@ -124,20 +130,28 @@ function flatten(values: Array<Array<number>>): Array<number> {
|
||||
}, [ ]);
|
||||
}
|
||||
|
||||
/**
 *  Reports whether codepoint is covered by an entry of Table_A_1_ranges.
 *
 *  @param codepoint  the code point to test
 *  @returns true if matchMap finds a containing range, false otherwise
 */
export function _nameprepTableA1(codepoint: number): boolean {
    const hit = matchMap(codepoint, Table_A_1_ranges);
    return Boolean(hit);
}
|
||||
|
||||
/**
 *  Looks up the replacement for a single code point, consulting the
 *  B.2 tables in order and returning the first hit:
 *    1. Table_B_2_ranges   - the code point shifted by the range's `s`
 *    2. Table_B_2_lut_abs  - the stored code points, as-is
 *    3. Table_B_2_lut_rel  - the code point shifted by the stored delta
 *    4. Table_B_2_complex  - the stored code points, as-is
 *
 *  @param codepoint  the code point to map
 *  @returns the replacement code points, or null if no table has an entry
 */
export function _nameprepTableB2(codepoint: number): Array<number> {
    // NOTE(review): Ranged.s is optional; an entry without it would make
    // this NaN - confirm every Table_B_2_ranges entry sets s
    const range = matchMap(codepoint, Table_B_2_ranges);
    if (range) { return [ codepoint + range.s ]; }

    const codes = Table_B_2_lut_abs[codepoint];
    if (codes) { return codes; }

    const shift = Table_B_2_lut_rel[codepoint];
    if (shift) { return [ codepoint + shift[0] ]; }

    const complex = Table_B_2_complex[codepoint];
    if (complex) { return complex; }

    return null;
}
|
||||
|
||||
/**
 *  Reports whether codepoint is covered by an entry of Table_C_ranges.
 *
 *  @param codepoint  the code point to test
 *  @returns true if matchMap finds a containing range, false otherwise
 */
export function _nameprepTableC(codepoint: number): boolean {
    const hit = matchMap(codepoint, Table_C_ranges);
    return Boolean(hit);
}
|
||||
|
||||
export function nameprep(value: string): string {
|
||||
@ -145,7 +159,7 @@ export function nameprep(value: string): string {
|
||||
// This allows platforms with incomplete normalize to bypass
|
||||
// it for very basic names which the built-in toLowerCase
|
||||
// will certainly handle correctly
|
||||
if (value.match(/^[a-z0-9-]*$/i)) { return value.toLowerCase(); }
|
||||
if (value.match(/^[a-z0-9-]*$/i) && value.length <= 59) { return value.toLowerCase(); }
|
||||
|
||||
// Get the code points (keeping the current normalization)
|
||||
let codes = toUtf8CodePoints(value);
|
||||
@ -163,17 +177,21 @@ export function nameprep(value: string): string {
|
||||
return [ code ];
|
||||
}));
|
||||
|
||||
// Normalize using fomr KC
|
||||
// Normalize using form KC
|
||||
codes = toUtf8CodePoints(_toUtf8String(codes), UnicodeNormalizationForm.NFKC);
|
||||
|
||||
// Prohibit C.1.2, C.2.2, C.3, C.4, C.5, C.6, C.7, C.8, C.9
|
||||
// Prohibit Tables C.1.2, C.2.2, C.3, C.4, C.5, C.6, C.7, C.8, C.9
|
||||
codes.forEach((code) => {
|
||||
if (Table_C_flags.indexOf(code) >= 0) { throw new Error("invalid character code"); }
|
||||
Table_C_ranges.forEach((range) => {
|
||||
if (code >= range.l && code <= range.l + range.h) {
|
||||
throw new Error("STRINGPREP_CONTAINS_PROHIBITED");
|
||||
}
|
||||
});
|
||||
if (_nameprepTableC(code)) {
|
||||
throw new Error("STRINGPREP_CONTAINS_PROHIBITED");
|
||||
}
|
||||
});
|
||||
|
||||
// Prohibit Unassigned Code Points (Table A.1)
|
||||
codes.forEach((code) => {
|
||||
if (_nameprepTableA1(code)) {
|
||||
throw new Error("STRINGPREP_CONTAINS_UNASSIGNED");
|
||||
}
|
||||
});
|
||||
|
||||
// IDNA extras
|
||||
@ -187,6 +205,8 @@ export function nameprep(value: string): string {
|
||||
// IDNA: 4.2.4
|
||||
if (name.length > 63) { throw new Error("too long"); }
|
||||
|
||||
|
||||
|
||||
return name;
|
||||
}
|
||||
|
||||
|
46
packages/testcases/input/nameprep/extract-tests.py
Normal file
46
packages/testcases/input/nameprep/extract-tests.py
Normal file
@ -0,0 +1,46 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
# Join the meaningful lines of the test-vector file into one string.
# Blank lines, '#' comment lines and lines starting with "Josefsson" or
# "Internet-Draft" (presumably page headers/footers) are dropped.
output = ""
# open() in a with-block closes the handle and works on Python 2 and 3,
# unlike the Python-2-only file() builtin
with open("test-vectors-00.txt") as vectors:
    for line in vectors:
        line = line.strip()
        if line == "" or line[0:1] == "#":
            continue
        if line.startswith("Josefsson") or line.startswith("Internet-Draft"):
            continue
        output += line.replace("\n", "")
|
||||
|
||||
Tests = [ ]
|
||||
|
||||
def get_byte(v):
    """Convert one token of a quoted test string into its byte value.

    A single character yields its ordinal; a 4-character escape of the
    form ``\\xHH`` yields the value of the two hex digits.

    Raises ValueError for any other token, or for invalid hex digits.
    """
    if len(v) == 1:
        return ord(v)
    # Anything else must be exactly \xHH; without this check a stray
    # multi-character token would be silently decoded from v[2:4]
    if len(v) != 4 or v[0:2] != "\\x":
        raise ValueError("unexpected token: %r" % (v,))
    return int(v[2:4], 16)
|
||||
|
||||
def get_string(value):
    """Decode one field of a test record.

    A double-quoted field is returned as a list of byte values, one per
    character or ``\\xHH`` escape; ``""`` pairs inside the quotes are
    removed first (presumably adjacent string literals being joined).
    A field equal to "null" in any letter case is returned as None.

    Raises Exception("unhandled") for any other field, including an
    empty one.
    """
    value = value.strip()
    # Slices instead of value[0] / value[-1] so an empty field reaches
    # the "unhandled" error below rather than raising IndexError
    if value[:1] == '"' and value[-1:] == '"':
        body = value[1:-1].replace('""', '')
        tokens = re.findall("(\\\\x[0-9a-fA-F]{2}|.)", body)
        # A real list (not a lazy map object) so json.dumps can
        # serialise it on Python 3 as well as Python 2
        return [get_byte(token) for token in tokens]
    if value.lower() == "null":
        return None
    raise Exception("unhandled")
|
||||
|
||||
Tests = [ ]

# Each test vector is a brace-delimited record of comma-separated fields:
# comment, input, output and then optionally profile, flags and rc.
matches = re.findall("({(?:.|\n)*?})", output)
for m in matches:
    # Strip the surrounding braces before splitting into fields
    comps = m[1:-1].split(",")
    test = dict(
        comment = comps[0].strip()[1:-1],
        input = get_string(comps[1]),
        output = get_string(comps[2])
    )
    if len(comps) >= 4:
        test["profile"] = get_string(comps[3])
    if len(comps) >= 5:
        test["flags"] = comps[4].strip()
    if len(comps) >= 6:
        test["rc"] = comps[5].strip()
    Tests.append(test)

# Parenthesised single-argument print emits the same text on Python 2 and 3
print(json.dumps(Tests))
|
@ -149,17 +149,28 @@ for delta in deltas:
|
||||
mappings.append(data)
|
||||
debug[data["l"]] = "MAP:" + str(data)
|
||||
|
||||
# Create complex table (things that map to more than one byte)
|
||||
complex = { }
|
||||
complex_output = [ ];
|
||||
for (src, dst, reason) in weird:
|
||||
for word in dst.split(" "):
|
||||
complex_output.append(int(word, 16))
|
||||
if len(word) != 4: raise Exception("hmmm")
|
||||
complex[int(src, 16)] = dst.replace(" ", "")
|
||||
|
||||
# Experimenting: We can easily create a LUT for the individual
|
||||
# components, as there is substantial overlap.
|
||||
#complex_output = dict((x, True) for x in complex_output).keys()
|
||||
#complex_output.sort()
|
||||
#print "COM", complex_output, len(complex_output)
|
||||
|
||||
# Sort mappings by lo
|
||||
mappings.sort(lambda a, b: cmp(a["l"], b["l"]))
|
||||
|
||||
debug_keys = debug.keys()
|
||||
debug_keys.sort()
|
||||
for d in debug_keys:
|
||||
print d, debug[d]
|
||||
#debug_keys = debug.keys()
|
||||
#debug_keys.sort()
|
||||
#for d in debug_keys:
|
||||
# print d, debug[d]
|
||||
|
||||
#print mappings
|
||||
|
||||
|
@ -1,3 +1,6 @@
|
||||
def hexify(v):
    """Return the lowercase hexadecimal digits of integer v, without "0x".

    Uses %x rather than slicing hex(v): the slice turns a negative value
    such as -0x5 into "x5" and keeps the trailing "L" of Python 2 longs.
    """
    return "%x" % v
|
||||
|
||||
prohibit = [ ]
|
||||
|
||||
table = None
|
||||
@ -28,42 +31,27 @@ print prohibit
|
||||
prohibit = list(dict([(p, True) for p in prohibit]).keys())
|
||||
prohibit.sort()
|
||||
|
||||
prohibit_single = [ ]
|
||||
prohibit_range = [ ]
|
||||
output = [ dict(lo = prohibit[0], hi = prohibit[0]) ]
|
||||
|
||||
last_range_start = None
|
||||
last = 0
|
||||
for p in prohibit:
|
||||
if p - 1 == last:
|
||||
if last_range_start is None:
|
||||
last_range_start = last
|
||||
if len(prohibit_single) > 0 and prohibit_single[-1] == last:
|
||||
prohibit_single.pop()
|
||||
for p in prohibit[1:]:
|
||||
if p - 1 == output[-1]["hi"]:
|
||||
output[-1]["hi"] = p
|
||||
else:
|
||||
if last_range_start is not None:
|
||||
print "Range", last_range_start, last - last_range_start, hex(last_range_start)
|
||||
length = last - last_range_start
|
||||
if length == 1:
|
||||
length = ""
|
||||
else:
|
||||
length = "-" + hex(length)[2:]
|
||||
prohibit_range.append([ last_range_start, length ])
|
||||
last_range_start = None
|
||||
else:
|
||||
print "Single", p, hex(p)
|
||||
prohibit_single.append(p)
|
||||
last = p
|
||||
output.append(dict(lo = p, hi = p))
|
||||
|
||||
print output
|
||||
|
||||
last = 0
|
||||
for i in xrange(0, len(prohibit_single)):
|
||||
v = prohibit_single[i]
|
||||
prohibit_single[i] -= last
|
||||
last = v
|
||||
print 'const Table_C_lut = "' + ",".join(hex(x)[2:] for x in prohibit_single) + '";'
|
||||
for r in output:
|
||||
r["h"] = r["hi"] - r["lo"]
|
||||
r["l"] = r["lo"] - last
|
||||
last = r["hi"]
|
||||
|
||||
r["range"] = hexify(r["l"])
|
||||
if r["h"] > 1:
|
||||
r["range"] += "-" + hexify(r["h"])
|
||||
elif r["h"] > 0:
|
||||
r["range"] += "-"
|
||||
|
||||
print 'const Table_C_ranges = "' + ",".join(x["range"] for x in output) + '";'
|
||||
|
||||
last = 0
|
||||
for item in prohibit_range:
|
||||
v = item[0]
|
||||
item[0] -= last
|
||||
last = v
|
||||
print 'const Table_C_ranges = "' + ",".join(("%s%s" % (hex(p[0])[2:], p[1])) for p in prohibit_range) + '";';
|
||||
|
File diff suppressed because it is too large
Load Diff
5099
packages/testcases/input/nameprep/rfc/rfc3454.txt
Normal file
5099
packages/testcases/input/nameprep/rfc/rfc3454.txt
Normal file
File diff suppressed because it is too large
Load Diff
395
packages/testcases/input/nameprep/rfc/rfc3491.txt
Normal file
395
packages/testcases/input/nameprep/rfc/rfc3491.txt
Normal file
@ -0,0 +1,395 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Network Working Group P. Hoffman
|
||||
Request for Comments: 3491 IMC & VPNC
|
||||
Category: Standards Track M. Blanchet
|
||||
Viagenie
|
||||
March 2003
|
||||
|
||||
|
||||
Nameprep: A Stringprep Profile for
|
||||
Internationalized Domain Names (IDN)
|
||||
|
||||
Status of this Memo
|
||||
|
||||
This document specifies an Internet standards track protocol for the
|
||||
Internet community, and requests discussion and suggestions for
|
||||
improvements. Please refer to the current edition of the "Internet
|
||||
Official Protocol Standards" (STD 1) for the standardization state
|
||||
and status of this protocol. Distribution of this memo is unlimited.
|
||||
|
||||
Copyright Notice
|
||||
|
||||
Copyright (C) The Internet Society (2003). All Rights Reserved.
|
||||
|
||||
Abstract
|
||||
|
||||
This document describes how to prepare internationalized domain name
|
||||
(IDN) labels in order to increase the likelihood that name input and
|
||||
name comparison work in ways that make sense for typical users
|
||||
throughout the world. This profile of the stringprep protocol is
|
||||
used as part of a suite of on-the-wire protocols for
|
||||
internationalizing the Domain Name System (DNS).
|
||||
|
||||
1. Introduction
|
||||
|
||||
This document specifies processing rules that will allow users to
|
||||
enter internationalized domain names (IDNs) into applications and
|
||||
have the highest chance of getting the content of the strings
|
||||
correct. It is a profile of stringprep [STRINGPREP]. These
|
||||
processing rules are only intended for internationalized domain
|
||||
names, not for arbitrary text.
|
||||
|
||||
This profile defines the following, as required by [STRINGPREP].
|
||||
|
||||
- The intended applicability of the profile: internationalized
|
||||
domain names processed by IDNA.
|
||||
|
||||
- The character repertoire that is the input and output to
|
||||
stringprep: Unicode 3.2, specified in section 2.
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 1]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
- The mappings used: specified in section 3.
|
||||
|
||||
- The Unicode normalization used: specified in section 4.
|
||||
|
||||
- The characters that are prohibited as output: specified in section
|
||||
5.
|
||||
|
||||
- Bidirectional character handling: specified in section 6.
|
||||
|
||||
1.1 Interaction of protocol parts
|
||||
|
||||
Nameprep is used by the IDNA [IDNA] protocol for preparing domain
|
||||
names; it is not designed for any other purpose. It is explicitly
|
||||
not designed for processing arbitrary free text and SHOULD NOT be
|
||||
used for that purpose. Nameprep is a profile of Stringprep
|
||||
[STRINGPREP]. Implementations of Nameprep MUST fully implement
|
||||
Stringprep.
|
||||
|
||||
Nameprep is used to process domain name labels, not domain names.
|
||||
IDNA calls nameprep for each label in a domain name, not for the
|
||||
whole domain name.
|
||||
|
||||
1.2 Terminology
|
||||
|
||||
The key words "MUST", "MUST NOT", "SHOULD", "SHOULD NOT", and "MAY"
|
||||
in this document are to be interpreted as described in BCP 14, RFC
|
||||
2119 [RFC2119].
|
||||
|
||||
2. Character Repertoire
|
||||
|
||||
This profile uses Unicode 3.2, as defined in [STRINGPREP] Appendix A.
|
||||
|
||||
3. Mapping
|
||||
|
||||
This profile specifies mapping using the following tables from
|
||||
[STRINGPREP]:
|
||||
|
||||
Table B.1
|
||||
Table B.2
|
||||
|
||||
4. Normalization
|
||||
|
||||
This profile specifies using Unicode normalization form KC, as
|
||||
described in [STRINGPREP].
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 2]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
5. Prohibited Output
|
||||
|
||||
This profile specifies prohibiting using the following tables from
|
||||
[STRINGPREP]:
|
||||
|
||||
Table C.1.2
|
||||
Table C.2.2
|
||||
Table C.3
|
||||
Table C.4
|
||||
Table C.5
|
||||
Table C.6
|
||||
Table C.7
|
||||
Table C.8
|
||||
Table C.9
|
||||
|
||||
IMPORTANT NOTE: This profile MUST be used with the IDNA protocol.
|
||||
The IDNA protocol has additional prohibitions that are checked
|
||||
outside of this profile.
|
||||
|
||||
6. Bidirectional characters
|
||||
|
||||
This profile specifies checking bidirectional strings as described in
|
||||
[STRINGPREP] section 6.
|
||||
|
||||
7. Unassigned Code Points in Internationalized Domain Names
|
||||
|
||||
If the processing in [IDNA] specifies that a list of unassigned code
|
||||
points be used, the system uses table A.1 from [STRINGPREP] as its
|
||||
list of unassigned code points.
|
||||
|
||||
8. References
|
||||
|
||||
8.1 Normative References
|
||||
|
||||
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
|
||||
Requirement Levels", BCP 14, RFC 2119, March 1997.
|
||||
|
||||
[STRINGPREP] Hoffman, P. and M. Blanchet, "Preparation of
|
||||
Internationalized Strings ("stringprep")", RFC 3454,
|
||||
December 2002.
|
||||
|
||||
[IDNA] Faltstrom, P., Hoffman, P. and A. Costello,
|
||||
"Internationalizing Domain Names in Applications
|
||||
(IDNA)", RFC 3490, March 2003.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 3]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
8.2 Informative references
|
||||
|
||||
[STD13] Mockapetris, P., "Domain names - concepts and
|
||||
facilities", STD 13, RFC 1034, and "Domain names -
|
||||
implementation and specification", STD 13, RFC 1035,
|
||||
November 1987.
|
||||
|
||||
9. Security Considerations
|
||||
|
||||
The Unicode and ISO/IEC 10646 repertoires have many characters that
|
||||
look similar. In many cases, users of security protocols might do
|
||||
visual matching, such as when comparing the names of trusted third
|
||||
parties. Because it is impossible to map similar-looking characters
|
||||
without a great deal of context such as knowing the fonts used,
|
||||
stringprep does nothing to map similar-looking characters together
|
||||
nor to prohibit some characters because they look like others.
|
||||
|
||||
Security on the Internet partly relies on the DNS. Thus, any change
|
||||
to the characteristics of the DNS can change the security of much of
|
||||
the Internet.
|
||||
|
||||
Domain names are used by users to connect to Internet servers. The
|
||||
security of the Internet would be compromised if a user entering a
|
||||
single internationalized name could be connected to different servers
|
||||
based on different interpretations of the internationalized domain
|
||||
name.
|
||||
|
||||
Current applications might assume that the characters allowed in
|
||||
domain names will always be the same as they are in [STD13]. This
|
||||
document vastly increases the number of characters available in
|
||||
domain names. Every program that uses "special" characters in
|
||||
conjunction with domain names may be vulnerable to attack based on
|
||||
the new characters allowed by this specification.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 4]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
10. IANA Considerations
|
||||
|
||||
This is a profile of stringprep. It has been registered by the IANA
|
||||
in the stringprep profile registry
|
||||
(www.iana.org/assignments/stringprep-profiles).
|
||||
|
||||
Name of this profile:
|
||||
Nameprep
|
||||
|
||||
RFC in which the profile is defined:
|
||||
This document.
|
||||
|
||||
Indicator whether or not this is the newest version of the
|
||||
profile:
|
||||
This is the first version of Nameprep.
|
||||
|
||||
11. Acknowledgements
|
||||
|
||||
Many people from the IETF IDN Working Group and the Unicode Technical
|
||||
Committee contributed ideas that went into this document.
|
||||
|
||||
The IDN Nameprep design team made many useful changes to the
|
||||
document. That team and its advisors include:
|
||||
|
||||
Asmus Freytag
|
||||
Cathy Wissink
|
||||
Francois Yergeau
|
||||
James Seng
|
||||
Marc Blanchet
|
||||
Mark Davis
|
||||
Martin Duerst
|
||||
Patrik Faltstrom
|
||||
Paul Hoffman
|
||||
|
||||
Additional significant improvements were proposed by:
|
||||
|
||||
Jonathan Rosenne
|
||||
Kent Karlsson
|
||||
Scott Hollenbeck
|
||||
Dave Crocker
|
||||
Erik Nordmark
|
||||
Matitiahu Allouche
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 5]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
12. Authors' Addresses
|
||||
|
||||
Paul Hoffman
|
||||
Internet Mail Consortium and VPN Consortium
|
||||
127 Segre Place
|
||||
Santa Cruz, CA 95060 USA
|
||||
|
||||
EMail: paul.hoffman@imc.org and paul.hoffman@vpnc.org
|
||||
|
||||
|
||||
Marc Blanchet
|
||||
Viagenie inc.
|
||||
2875 boul. Laurier, bur. 300
|
||||
Ste-Foy, Quebec, Canada, G1V 2M2
|
||||
|
||||
EMail: Marc.Blanchet@viagenie.qc.ca
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 6]
|
||||
|
||||
RFC 3491 IDN Nameprep March 2003
|
||||
|
||||
|
||||
13. Full Copyright Statement
|
||||
|
||||
Copyright (C) The Internet Society (2003). All Rights Reserved.
|
||||
|
||||
This document and translations of it may be copied and furnished to
|
||||
others, and derivative works that comment on or otherwise explain it
|
||||
or assist in its implementation may be prepared, copied, published
|
||||
and distributed, in whole or in part, without restriction of any
|
||||
kind, provided that the above copyright notice and this paragraph are
|
||||
included on all such copies and derivative works. However, this
|
||||
document itself may not be modified in any way, such as by removing
|
||||
the copyright notice or references to the Internet Society or other
|
||||
Internet organizations, except as needed for the purpose of
|
||||
developing Internet standards in which case the procedures for
|
||||
copyrights defined in the Internet Standards process must be
|
||||
followed, or as required to translate it into languages other than
|
||||
English.
|
||||
|
||||
The limited permissions granted above are perpetual and will not be
|
||||
revoked by the Internet Society or its successors or assigns.
|
||||
|
||||
This document and the information contained herein is provided on an
|
||||
"AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING
|
||||
TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING
|
||||
BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION
|
||||
HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF
|
||||
MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
|
||||
|
||||
Acknowledgement
|
||||
|
||||
Funding for the RFC Editor function is currently provided by the
|
||||
Internet Society.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Hoffman & Blanchet Standards Track [Page 7]
|
||||
|
1291
packages/testcases/input/nameprep/rfc/rfc5890.txt
Normal file
1291
packages/testcases/input/nameprep/rfc/rfc5890.txt
Normal file
File diff suppressed because it is too large
Load Diff
955
packages/testcases/input/nameprep/rfc/rfc5891.txt
Normal file
955
packages/testcases/input/nameprep/rfc/rfc5891.txt
Normal file
@ -0,0 +1,955 @@
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Internet Engineering Task Force (IETF) J. Klensin
|
||||
Request for Comments: 5891 August 2010
|
||||
Obsoletes: 3490, 3491
|
||||
Updates: 3492
|
||||
Category: Standards Track
|
||||
ISSN: 2070-1721
|
||||
|
||||
|
||||
Internationalized Domain Names in Applications (IDNA): Protocol
|
||||
|
||||
Abstract
|
||||
|
||||
This document is the revised protocol definition for
|
||||
Internationalized Domain Names (IDNs). The rationale for changes,
|
||||
the relationship to the older specification, and important
|
||||
terminology are provided in other documents. This document specifies
|
||||
the protocol mechanism, called Internationalized Domain Names in
|
||||
Applications (IDNA), for registering and looking up IDNs in a way
|
||||
that does not require changes to the DNS itself. IDNA is only meant
|
||||
for processing domain names, not free text.
|
||||
|
||||
Status of This Memo
|
||||
|
||||
This is an Internet Standards Track document.
|
||||
|
||||
This document is a product of the Internet Engineering Task Force
|
||||
(IETF). It represents the consensus of the IETF community. It has
|
||||
received public review and has been approved for publication by the
|
||||
Internet Engineering Steering Group (IESG). Further information on
|
||||
Internet Standards is available in Section 2 of RFC 5741.
|
||||
|
||||
Information about the current status of this document, any errata,
|
||||
and how to provide feedback on it may be obtained at
|
||||
http://www.rfc-editor.org/info/rfc5891.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 1]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
Copyright Notice
|
||||
|
||||
Copyright (c) 2010 IETF Trust and the persons identified as the
|
||||
document authors. All rights reserved.
|
||||
|
||||
This document is subject to BCP 78 and the IETF Trust's Legal
|
||||
Provisions Relating to IETF Documents
|
||||
(http://trustee.ietf.org/license-info) in effect on the date of
|
||||
publication of this document. Please review these documents
|
||||
carefully, as they describe your rights and restrictions with respect
|
||||
to this document. Code Components extracted from this document must
|
||||
include Simplified BSD License text as described in Section 4.e of
|
||||
the Trust Legal Provisions and are provided without warranty as
|
||||
described in the Simplified BSD License.
|
||||
|
||||
This document may contain material from IETF Documents or IETF
|
||||
Contributions published or made publicly available before November
|
||||
10, 2008. The person(s) controlling the copyright in some of this
|
||||
material may not have granted the IETF Trust the right to allow
|
||||
modifications of such material outside the IETF Standards Process.
|
||||
Without obtaining an adequate license from the person(s) controlling
|
||||
the copyright in such materials, this document may not be modified
|
||||
outside the IETF Standards Process, and derivative works of it may
|
||||
not be created outside the IETF Standards Process, except to format
|
||||
it for publication as an RFC or to translate it into languages other
|
||||
than English.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 2]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
Table of Contents
|
||||
|
||||
1. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 4
|
||||
2. Terminology . . . . . . . . . . . . . . . . . . . . . . . . . 4
|
||||
3. Requirements and Applicability . . . . . . . . . . . . . . . . 5
|
||||
3.1. Requirements . . . . . . . . . . . . . . . . . . . . . . . 5
|
||||
3.2. Applicability . . . . . . . . . . . . . . . . . . . . . . 5
|
||||
3.2.1. DNS Resource Records . . . . . . . . . . . . . . . . . 6
|
||||
3.2.2. Non-Domain-Name Data Types Stored in the DNS . . . . . 6
|
||||
4. Registration Protocol . . . . . . . . . . . . . . . . . . . . 6
|
||||
4.1. Input to IDNA Registration . . . . . . . . . . . . . . . . 7
|
||||
4.2. Permitted Character and Label Validation . . . . . . . . . 7
|
||||
4.2.1. Input Format . . . . . . . . . . . . . . . . . . . . . 7
|
||||
4.2.2. Rejection of Characters That Are Not Permitted . . . . 8
|
||||
4.2.3. Label Validation . . . . . . . . . . . . . . . . . . . 8
|
||||
4.2.4. Registration Validation Requirements . . . . . . . . . 9
|
||||
4.3. Registry Restrictions . . . . . . . . . . . . . . . . . . 9
|
||||
4.4. Punycode Conversion . . . . . . . . . . . . . . . . . . . 9
|
||||
4.5. Insertion in the Zone . . . . . . . . . . . . . . . . . . 10
|
||||
5. Domain Name Lookup Protocol . . . . . . . . . . . . . . . . . 10
|
||||
5.1. Label String Input . . . . . . . . . . . . . . . . . . . . 10
|
||||
5.2. Conversion to Unicode . . . . . . . . . . . . . . . . . . 10
|
||||
5.3. A-label Input . . . . . . . . . . . . . . . . . . . . . . 10
|
||||
5.4. Validation and Character List Testing . . . . . . . . . . 11
|
||||
5.5. Punycode Conversion . . . . . . . . . . . . . . . . . . . 13
|
||||
5.6. DNS Name Resolution . . . . . . . . . . . . . . . . . . . 13
|
||||
6. Security Considerations . . . . . . . . . . . . . . . . . . . 13
|
||||
7. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 13
|
||||
8. Contributors . . . . . . . . . . . . . . . . . . . . . . . . . 13
|
||||
9. Acknowledgments . . . . . . . . . . . . . . . . . . . . . . . 14
|
||||
10. References . . . . . . . . . . . . . . . . . . . . . . . . . . 14
|
||||
10.1. Normative References . . . . . . . . . . . . . . . . . . . 14
|
||||
10.2. Informative References . . . . . . . . . . . . . . . . . . 15
|
||||
Appendix A. Summary of Major Changes from IDNA2003 . . . . . . . 17
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 3]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
1. Introduction
|
||||
|
||||
This document supplies the protocol definition for Internationalized
|
||||
Domain Names in Applications (IDNA), with the version specified here
|
||||
known as IDNA2008. Essential definitions and terminology for
|
||||
understanding this document and a road map of the collection of
|
||||
documents that make up IDNA2008 appear in a separate Definitions
|
||||
document [RFC5890]. Appendix A discusses the relationship between
|
||||
this specification and the earlier version of IDNA (referred to here
|
||||
as "IDNA2003"). The rationale for these changes, along with
|
||||
considerable explanatory material and advice to zone administrators
|
||||
who support IDNs, is provided in another document, known informally
|
||||
in this series as the "Rationale document" [RFC5894].
|
||||
|
||||
IDNA works by allowing applications to use certain ASCII [ASCII]
|
||||
string labels (beginning with a special prefix) to represent
|
||||
non-ASCII name labels. Lower-layer protocols need not be aware of
|
||||
this; therefore, IDNA does not change any infrastructure. In
|
||||
particular, IDNA does not depend on any changes to DNS servers,
|
||||
resolvers, or DNS protocol elements, because the ASCII name service
|
||||
provided by the existing DNS can be used for IDNA.
|
||||
|
||||
IDNA applies only to a specific subset of DNS labels. The base DNS
|
||||
standards [RFC1034] [RFC1035] and their various updates specify how
|
||||
to combine labels into fully-qualified domain names and parse labels
|
||||
out of those names.
|
||||
|
||||
This document describes two separate protocols, one for IDN
|
||||
registration (Section 4) and one for IDN lookup (Section 5). These
|
||||
two protocols share some terminology, reference data, and operations.
|
||||
|
||||
2. Terminology
|
||||
|
||||
As mentioned above, terminology used as part of the definition of
|
||||
IDNA appears in the Definitions document [RFC5890]. It is worth
|
||||
noting that some of this terminology overlaps with, and is consistent
|
||||
with, that used in Unicode or other character set standards and the
|
||||
DNS. Readers of this document are assumed to be familiar with the
|
||||
associated Definitions document and with the DNS-specific terminology
|
||||
in RFC 1034 [RFC1034].
|
||||
|
||||
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
|
||||
"SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
|
||||
document are to be interpreted as described in BCP 14, RFC 2119
|
||||
[RFC2119].
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 4]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
3. Requirements and Applicability
|
||||
|
||||
3.1. Requirements
|
||||
|
||||
IDNA makes the following requirements:
|
||||
|
||||
1. Whenever a domain name is put into a domain name slot that is not
|
||||
IDNA-aware (see Section 2.3.2.6 of the Definitions document
|
||||
[RFC5890]), it MUST contain only ASCII characters (i.e., its
|
||||
labels must be either A-labels or NR-LDH labels), unless the DNS
|
||||
application is not subject to historical recommendations for
|
||||
"hostname"-style names (see RFC 1034 [RFC1034] and
|
||||
Section 3.2.1).
|
||||
|
||||
2. Labels MUST be compared using equivalent forms: either both
|
||||
A-label forms or both U-label forms. Because A-labels and
|
||||
U-labels can be transformed into each other without loss of
|
||||
information, these comparisons are equivalent (however, in
|
||||
practice, comparison of U-labels requires first verifying that
|
||||
they actually are U-labels and not just Unicode strings). A pair
|
||||
of A-labels MUST be compared as case-insensitive ASCII (as with
|
||||
all comparisons of ASCII DNS labels). U-labels MUST be compared
|
||||
as-is, without case folding or other intermediate steps. While
|
||||
it is not necessary to validate labels in order to compare them,
|
||||
successful comparison does not imply validity. In many cases,
|
||||
not limited to comparison, validation may be important for other
|
||||
reasons and SHOULD be performed.
|
||||
|
||||
3. Labels being registered MUST conform to the requirements of
|
||||
Section 4. Labels being looked up and the lookup process MUST
|
||||
conform to the requirements of Section 5.
|
||||
|
||||
3.2. Applicability
|
||||
|
||||
IDNA applies to all domain names in all domain name slots in
|
||||
protocols except where it is explicitly excluded. It does not apply
|
||||
to domain name slots that do not use the LDH syntax rules as
|
||||
described in the Definitions document [RFC5890].
|
||||
|
||||
Because it uses the DNS, IDNA applies to many protocols that were
|
||||
specified before it was designed. IDNs occupying domain name slots
|
||||
in those older protocols MUST be in A-label form until and unless
|
||||
those protocols and their implementations are explicitly upgraded to
|
||||
be aware of IDNs and to accept the U-label form. IDNs actually
|
||||
appearing in DNS queries or responses MUST be A-labels.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 5]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
IDNA-aware protocols and implementations MAY accept U-labels,
|
||||
A-labels, or both as those particular protocols specify. IDNA is not
|
||||
defined for extended label types (see RFC 2671 [RFC2671], Section 3).
|
||||
|
||||
3.2.1. DNS Resource Records
|
||||
|
||||
IDNA applies only to domain names in the NAME and RDATA fields of DNS
|
||||
resource records whose CLASS is IN. See the DNS specification
|
||||
[RFC1035] for precise definitions of these terms.
|
||||
|
||||
The application of IDNA to DNS resource records depends entirely on
|
||||
the CLASS of the record, and not on the TYPE except as noted below.
|
||||
This will remain true, even as new TYPEs are defined, unless a new
|
||||
TYPE defines TYPE-specific rules. Special naming conventions for SRV
|
||||
records (and "underscore labels" more generally) are incompatible
|
||||
with IDNA coding as discussed in the Definitions document [RFC5890],
|
||||
especially Section 2.3.2.3. Of course, underscore labels may be part
|
||||
of a domain that uses IDN labels at higher levels in the tree.
|
||||
|
||||
3.2.2. Non-Domain-Name Data Types Stored in the DNS
|
||||
|
||||
Although IDNA enables the representation of non-ASCII characters in
|
||||
domain names, that does not imply that IDNA enables the
|
||||
representation of non-ASCII characters in other data types that are
|
||||
stored in domain names, specifically in the RDATA field for types
|
||||
that have structured RDATA format. For example, an email address
|
||||
local part is stored in a domain name in the RNAME field as part of
|
||||
the RDATA of an SOA record (e.g., hostmaster@example.com would be
|
||||
represented as hostmaster.example.com). IDNA does not update the
|
||||
existing email standards, which allow only ASCII characters in local
|
||||
parts. Even though work is in progress to define
|
||||
internationalization for email addresses [RFC4952], changes to the
|
||||
email address part of the SOA RDATA would require action in, or
|
||||
updates to, other standards, specifically those that specify the
|
||||
format of the SOA RR.
|
||||
|
||||
4. Registration Protocol
|
||||
|
||||
This section defines the model for registering an IDN. The model is
|
||||
implementation independent; any sequence of steps that produces
|
||||
exactly the same result for all labels is considered a valid
|
||||
implementation.
|
||||
|
||||
Note that, while the registration (this section) and lookup protocols
|
||||
(Section 5) are very similar in most respects, they are not
|
||||
identical, and implementers should carefully follow the steps
|
||||
described in this specification.
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 6]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
4.1. Input to IDNA Registration
|
||||
|
||||
Registration processes, especially processing by entities (often
|
||||
called "registrars") who deal with registrants before the request
|
||||
actually reaches the zone manager ("registry"), are outside the scope
|
||||
of this definition and may differ significantly depending on local
|
||||
needs. By the time a string enters the IDNA registration process as
|
||||
described in this specification, it MUST be in Unicode and in
|
||||
Normalization Form C (NFC [Unicode-UAX15]). Entities responsible for
|
||||
zone files ("registries") MUST accept only the exact string for which
|
||||
registration is requested, free of any mappings or local adjustments.
|
||||
They MAY accept that input in any of three forms:
|
||||
|
||||
1. As a pair of A-label and U-label.
|
||||
|
||||
2. As an A-label only.
|
||||
|
||||
3. As a U-label only.
|
||||
|
||||
The first two of these forms are RECOMMENDED because the use of
|
||||
A-labels avoids any possibility of ambiguity. The first is normally
|
||||
preferred over the second because it permits further verification of
|
||||
user intent (see Section 4.2.1).
|
||||
|
||||
4.2. Permitted Character and Label Validation
|
||||
|
||||
4.2.1. Input Format
|
||||
|
||||
If both the U-label and A-label forms are available, the registry
|
||||
MUST ensure that the A-label form is in lowercase, perform a
|
||||
conversion to a U-label, perform the steps and tests described below
|
||||
on that U-label, and then verify that the A-label produced by the
|
||||
step in Section 4.4 matches the one provided as input. In addition,
|
||||
the U-label that was provided as input and the one obtained by
|
||||
conversion of the A-label MUST match exactly. If, for some reason,
|
||||
these tests fail, the registration MUST be rejected.
|
||||
|
||||
If only an A-label was provided and the conversion to a U-label is
|
||||
not performed, the registry MUST still verify that the A-label is
|
||||
superficially valid, i.e., that it does not violate any of the rules
|
||||
of Punycode encoding [RFC3492] such as the prohibition on trailing
|
||||
hyphen-minus, the requirement that all characters be ASCII, and so
|
||||
on. Strings that appear to be A-labels (e.g., they start with
|
||||
"xn--") and strings that are supplied to the registry in a context
|
||||
reserved for A-labels (such as a field in a form to be filled out),
|
||||
but that are not valid A-labels as described in this paragraph, MUST
|
||||
NOT be placed in DNS zones that support IDNA.
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 7]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
If only an A-label is provided and the conversion to a U-label is not
|
||||
performed, but the superficial tests described in the previous
|
||||
paragraph are performed, registration procedures MAY, and usually
|
||||
will, bypass the tests and actions in the balance of Section 4.2 and
|
||||
in Sections 4.3 and 4.4.
|
||||
|
||||
4.2.2. Rejection of Characters That Are Not Permitted
|
||||
|
||||
The candidate Unicode string MUST NOT contain characters that appear
|
||||
in the "DISALLOWED" and "UNASSIGNED" lists specified in the Tables
|
||||
document [RFC5892].
|
||||
|
||||
4.2.3. Label Validation
|
||||
|
||||
The proposed label (in the form of a Unicode string, i.e., a string
|
||||
that at least superficially appears to be a U-label) is then examined
|
||||
using tests that require examination of more than one character.
|
||||
Character order is considered to be the on-the-wire order. That
|
||||
order may not be the same as the display order.
|
||||
|
||||
4.2.3.1. Hyphen Restrictions
|
||||
|
||||
The Unicode string MUST NOT contain "--" (two consecutive hyphens) in
|
||||
the third and fourth character positions and MUST NOT start or end
|
||||
with a "-" (hyphen).
|
||||
|
||||
4.2.3.2. Leading Combining Marks
|
||||
|
||||
The Unicode string MUST NOT begin with a combining mark or combining
|
||||
character (see The Unicode Standard, Section 2.11 [Unicode] for an
|
||||
exact definition).
|
||||
|
||||
4.2.3.3. Contextual Rules
|
||||
|
||||
The Unicode string MUST NOT contain any characters whose validity is
|
||||
context-dependent, unless the validity is positively confirmed by a
|
||||
contextual rule. To check this, each code point identified as
|
||||
CONTEXTJ or CONTEXTO in the Tables document [RFC5892] MUST have a
|
||||
non-null rule. If such a code point is missing a rule, the label is
|
||||
invalid. If the rule exists but the result of applying the rule is
|
||||
negative or inconclusive, the proposed label is invalid.
|
||||
|
||||
4.2.3.4. Labels Containing Characters Written Right to Left
|
||||
|
||||
If the proposed label contains any characters from scripts that are
|
||||
written from right to left, it MUST meet the Bidi criteria [RFC5893].
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 8]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
4.2.4. Registration Validation Requirements
|
||||
|
||||
Strings that contain at least one non-ASCII character, have been
|
||||
produced by the steps above, whose contents pass all of the tests in
|
||||
Section 4.2.3, and are 63 or fewer characters long in
|
||||
ASCII-compatible encoding (ACE) form (see Section 4.4), are U-labels.
|
||||
|
||||
To summarize, tests are made in Section 4.2 for invalid characters,
|
||||
invalid combinations of characters, for labels that are invalid even
|
||||
if the characters they contain are valid individually, and for labels
|
||||
that do not conform to the restrictions for strings containing
|
||||
right-to-left characters.
|
||||
|
||||
4.3. Registry Restrictions
|
||||
|
||||
In addition to the rules and tests above, there are many reasons why
|
||||
a registry could reject a label. Registries at all levels of the
|
||||
DNS, not just the top level, are expected to establish policies about
|
||||
label registrations. Policies are likely to be informed by the local
|
||||
languages and the scripts that are used to write them and may depend
|
||||
on many factors including what characters are in the label (for
|
||||
example, a label may be rejected based on other labels already
|
||||
registered). See the Rationale document [RFC5894], Section 3.2, for
|
||||
further discussion and recommendations about registry policies.
|
||||
|
||||
The string produced by the steps in Section 4.2 is checked and
|
||||
processed as appropriate to local registry restrictions. Application
|
||||
of those registry restrictions may result in the rejection of some
|
||||
labels or the application of special restrictions to others.
|
||||
|
||||
4.4. Punycode Conversion
|
||||
|
||||
The resulting U-label is converted to an A-label (defined in Section
|
||||
2.3.2.1 of the Definitions document [RFC5890]). The A-label is the
|
||||
encoding of the U-label according to the Punycode algorithm [RFC3492]
|
||||
with the ACE prefix "xn--" added at the beginning of the string. The
|
||||
resulting string must, of course, conform to the length limits
|
||||
imposed by the DNS. This document does not update or alter the
|
||||
Punycode algorithm specified in RFC 3492 in any way. RFC 3492 does
|
||||
make a non-normative reference to the information about the value and
|
||||
construction of the ACE prefix that appears in RFC 3490 or Nameprep
|
||||
[RFC3491]. For consistency and reader convenience, IDNA2008
|
||||
effectively updates that reference to point to this document. That
|
||||
change does not alter the prefix itself. The prefix, "xn--", is the
|
||||
same in both sets of documents.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 9]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
With the exception of the maximum string length test on Punycode
|
||||
output, the failure conditions identified in the Punycode encoding
|
||||
procedure cannot occur if the input is a U-label as determined by the
|
||||
steps in Sections 4.1 through 4.3 above.
|
||||
|
||||
4.5. Insertion in the Zone
|
||||
|
||||
The label is registered in the DNS by inserting the A-label into a
|
||||
zone.
|
||||
|
||||
5. Domain Name Lookup Protocol
|
||||
|
||||
Lookup is different from registration and different tests are applied
|
||||
on the client. Although some validity checks are necessary to avoid
|
||||
serious problems with the protocol, the lookup-side tests are more
|
||||
permissive and rely on the assumption that names that are present in
|
||||
the DNS are valid. That assumption is, however, a weak one because
|
||||
the presence of wildcards in the DNS might cause a string that is not
|
||||
actually registered in the DNS to be successfully looked up.
|
||||
|
||||
5.1. Label String Input
|
||||
|
||||
The user supplies a string in the local character set, for example,
|
||||
by typing it, clicking on it, or copying and pasting it from a
|
||||
resource identifier, e.g., a Uniform Resource Identifier (URI)
|
||||
[RFC3986] or an Internationalized Resource Identifier (IRI)
|
||||
[RFC3987], from which the domain name is extracted. Alternatively,
|
||||
some process not directly involving the user may read the string from
|
||||
a file or obtain it in some other way. Processing in this step and
|
||||
the one specified in Section 5.2 are local matters, to be
|
||||
accomplished prior to actual invocation of IDNA.
|
||||
|
||||
5.2. Conversion to Unicode
|
||||
|
||||
The string is converted from the local character set into Unicode, if
|
||||
it is not already in Unicode. Depending on local needs, this
|
||||
conversion may involve mapping some characters into other characters
|
||||
as well as coding conversions. Those issues are discussed in the
|
||||
mapping-related sections (Sections 4.2, 4.4, 6, and 7.3) of the
|
||||
Rationale document [RFC5894] and in the separate Mapping document
|
||||
[IDNA2008-Mapping]. The result MUST be a Unicode string in NFC form.
|
||||
|
||||
5.3. A-label Input
|
||||
|
||||
If the input to this procedure appears to be an A-label (i.e., it
|
||||
starts with "xn--", interpreted case-insensitively), the lookup
|
||||
application MAY attempt to convert it to a U-label, first ensuring
|
||||
that the A-label is entirely in lowercase (converting it to lowercase
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 10]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
if necessary), and apply the tests of Section 5.4 and the conversion
|
||||
of Section 5.5 to that form. If the label is converted to Unicode
|
||||
(i.e., to U-label form) using the Punycode decoding algorithm, then
|
||||
the processing specified in those two sections MUST be performed, and
|
||||
the label MUST be rejected if the resulting label is not identical to
|
||||
the original. See Section 8.1 of the Rationale document [RFC5894]
|
||||
for additional discussion on this topic.
|
||||
|
||||
Conversion from the A-label and testing that the result is a U-label
|
||||
SHOULD be performed if the domain name will later be presented to the
|
||||
user in native character form (this requires that the lookup
|
||||
application be IDNA-aware). If those steps are not performed, the
|
||||
lookup process SHOULD at least test to determine that the string is
|
||||
actually an A-label, examining it for the invalid formats specified
|
||||
in the Punycode decoding specification. Applications that are not
|
||||
IDNA-aware will obviously omit that testing; others MAY treat the
|
||||
string as opaque to avoid the additional processing at the expense of
|
||||
providing less protection and information to users.
|
||||
|
||||
5.4. Validation and Character List Testing
|
||||
|
||||
As with the registration procedure described in Section 4, the
|
||||
Unicode string is checked to verify that all characters that appear
|
||||
in it are valid as input to IDNA lookup processing. As discussed
|
||||
above and in the Rationale document [RFC5894], the lookup check is
|
||||
more liberal than the registration one. Labels that have not been
|
||||
fully evaluated for conformance to the applicable rules are referred
|
||||
to as "putative" labels as discussed in Section 2.3.2.1 of the
|
||||
Definitions document [RFC5890]. Putative U-labels with any of the
|
||||
following characteristics MUST be rejected prior to DNS lookup:
|
||||
|
||||
o Labels that are not in NFC [Unicode-UAX15].
|
||||
|
||||
o Labels containing "--" (two consecutive hyphens) in the third and
|
||||
fourth character positions.
|
||||
|
||||
o Labels whose first character is a combining mark (see The Unicode
|
||||
Standard, Section 2.11 [Unicode]).
|
||||
|
||||
o Labels containing prohibited code points, i.e., those that are
|
||||
assigned to the "DISALLOWED" category of the Tables document
|
||||
[RFC5892].
|
||||
|
||||
o Labels containing code points that are identified in the Tables
|
||||
document as "CONTEXTJ", i.e., requiring exceptional contextual
|
||||
rule processing on lookup, but that do not conform to those rules.
|
||||
Note that this implies that a rule must be defined, not null: a
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 11]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
character that requires a contextual rule but for which the rule
|
||||
is null is treated in this step as having failed to conform to the
|
||||
rule.
|
||||
|
||||
o Labels containing code points that are identified in the Tables
|
||||
document as "CONTEXTO", but for which no such rule appears in the
|
||||
table of rules. Applications resolving DNS names or carrying out
|
||||
equivalent operations are not required to test contextual rules
|
||||
for "CONTEXTO" characters, only to verify that a rule is defined
|
||||
(although they MAY make such tests to provide better protection or
|
||||
give better information to the user).
|
||||
|
||||
o Labels containing code points that are unassigned in the version
|
||||
of Unicode being used by the application, i.e., in the UNASSIGNED
|
||||
category of the Tables document.
|
||||
|
||||
This requirement means that the application must use a list of
|
||||
unassigned characters that is matched to the version of Unicode
|
||||
that is being used for the other requirements in this section. It
|
||||
is not required that the application know which version of Unicode
|
||||
is being used; that information might be part of the operating
|
||||
environment in which the application is running.
|
||||
|
||||
In addition, the application SHOULD apply the following test.
|
||||
|
||||
o Verification that the string is compliant with the requirements
|
||||
for right-to-left characters specified in the Bidi document
|
||||
[RFC5893].
|
||||
|
||||
This test may be omitted in special circumstances, such as when the
|
||||
lookup application knows that the conditions are enforced elsewhere,
|
||||
because an attempt to look up and resolve such strings will almost
|
||||
certainly lead to a DNS lookup failure except when wildcards are
|
||||
present in the zone. However, applying the test is likely to give
|
||||
much better information about the reason for a lookup failure --
|
||||
information that may be usefully passed to the user when that is
|
||||
feasible -- than DNS resolution failure information alone.
|
||||
|
||||
For all other strings, the lookup application MUST rely on the
|
||||
presence or absence of labels in the DNS to determine the validity of
|
||||
those labels and the validity of the characters they contain. If
|
||||
they are registered, they are presumed to be valid; if they are not,
|
||||
their possible validity is not relevant. While a lookup application
|
||||
may reasonably issue warnings about strings it believes may be
|
||||
problematic, applications that decline to process a string that
|
||||
conforms to the rules above (i.e., does not look it up in the DNS)
|
||||
are not in conformance with this protocol.
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 12]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
5.5. Punycode Conversion
|
||||
|
||||
The string that has now been validated for lookup is converted to ACE
|
||||
form by applying the Punycode algorithm to the string and then adding
|
||||
the ACE prefix ("xn--").
|
||||
|
||||
5.6. DNS Name Resolution
|
||||
|
||||
The A-label resulting from the conversion in Section 5.5 or supplied
|
||||
directly (see Section 5.3) is combined with other labels as needed to
|
||||
form a fully-qualified domain name that is then looked up in the DNS,
|
||||
using normal DNS resolver procedures. The lookup can obviously
|
||||
either succeed (returning information) or fail.
|
||||
|
||||
6. Security Considerations
|
||||
|
||||
Security Considerations for this version of IDNA are described in the
|
||||
Definitions document [RFC5890], except for the special issues
|
||||
associated with right-to-left scripts and characters. The latter are
|
||||
discussed in the Bidi document [RFC5893].
|
||||
|
||||
In order to avoid intentional or accidental attacks from labels that
|
||||
might be confused with others, special problems in rendering, and so
|
||||
on, the IDNA model requires that registries exercise care and
|
||||
thoughtfulness about what labels they choose to permit. That issue
|
||||
is discussed in Section 4.3 of this document which, in turn, points
|
||||
to a somewhat more extensive discussion in the Rationale document
|
||||
[RFC5894].
|
||||
|
||||
7. IANA Considerations
|
||||
|
||||
IANA actions for this version of IDNA are specified in the Tables
|
||||
document [RFC5892] and discussed informally in the Rationale document
|
||||
[RFC5894]. The components of IDNA described in this document do not
|
||||
require any IANA actions.
|
||||
|
||||
8. Contributors
|
||||
|
||||
While the listed editor held the pen, the original versions of this
|
||||
document represent the joint work and conclusions of an ad hoc design
|
||||
team consisting of the editor and, in alphabetic order, Harald
|
||||
Alvestrand, Tina Dam, Patrik Faltstrom, and Cary Karp. This document
|
||||
draws significantly on the original version of IDNA [RFC3490] both
|
||||
conceptually and for specific text. This second-generation version
|
||||
would not have been possible without the work that went into that
|
||||
first version and especially the contributions of its authors Patrik
|
||||
Faltstrom, Paul Hoffman, and Adam Costello. While Faltstrom was
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 13]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
actively involved in the creation of this version, Hoffman and
|
||||
Costello were not and should not be held responsible for any errors
|
||||
or omissions.
|
||||
|
||||
9. Acknowledgments
|
||||
|
||||
This revision to IDNA would have been impossible without the
|
||||
accumulated experience since RFC 3490 was published and resulting
|
||||
comments and complaints of many people in the IETF, ICANN, and other
|
||||
communities (too many people to list here). Nor would it have been
|
||||
possible without RFC 3490 itself and the efforts of the Working Group
|
||||
that defined it. Those people whose contributions are acknowledged
|
||||
in RFC 3490, RFC 4690 [RFC4690], and the Rationale document [RFC5894]
|
||||
were particularly important.
|
||||
|
||||
Specific textual changes were incorporated into this document after
|
||||
suggestions from the other contributors, Stephane Bortzmeyer, Vint
|
||||
Cerf, Lisa Dusseault, Paul Hoffman, Kent Karlsson, James Mitchell,
|
||||
Erik van der Poel, Marcos Sanz, Andrew Sullivan, Wil Tan, Ken
|
||||
Whistler, Chris Wright, and other WG participants and reviewers
|
||||
including Martin Duerst, James Mitchell, Subramanian Moonesamy, Peter
|
||||
Saint-Andre, Margaret Wasserman, and Dan Winship who caught specific
|
||||
errors and recommended corrections. Special thanks are due to Paul
|
||||
Hoffman for permission to extract material to form the basis for
|
||||
Appendix A from a draft document that he prepared.
|
||||
|
||||
10. References
|
||||
|
||||
10.1. Normative References
|
||||
|
||||
[RFC1034] Mockapetris, P., "Domain names - concepts and
|
||||
facilities", STD 13, RFC 1034, November 1987.
|
||||
|
||||
[RFC1035] Mockapetris, P., "Domain names - implementation and
|
||||
specification", STD 13, RFC 1035, November 1987.
|
||||
|
||||
[RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
|
||||
Requirement Levels", BCP 14, RFC 2119, March 1997.
|
||||
|
||||
[RFC3492] Costello, A., "Punycode: A Bootstring encoding of
|
||||
Unicode for Internationalized Domain Names in
|
||||
Applications (IDNA)", RFC 3492, March 2003.
|
||||
|
||||
[RFC5890] Klensin, J., "Internationalized Domain Names for
|
||||
Applications (IDNA): Definitions and Document
|
||||
Framework", RFC 5890, August 2010.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 14]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
[RFC5892] Faltstrom, P., Ed., "The Unicode Code Points and
|
||||
Internationalized Domain Names for Applications (IDNA)",
|
||||
RFC 5892, August 2010.
|
||||
|
||||
[RFC5893] Alvestrand, H., Ed. and C. Karp, "Right-to-Left Scripts
|
||||
for Internationalized Domain Names for Applications
|
||||
(IDNA)", RFC 5893, August 2010.
|
||||
|
||||
[Unicode-UAX15]
|
||||
The Unicode Consortium, "Unicode Standard Annex #15:
|
||||
Unicode Normalization Forms", September 2009,
|
||||
<http://www.unicode.org/reports/tr15/>.
|
||||
|
||||
10.2. Informative References
|
||||
|
||||
[ASCII] American National Standards Institute (formerly United
|
||||
States of America Standards Institute), "USA Code for
|
||||
Information Interchange", ANSI X3.4-1968, 1968. ANSI
|
||||
X3.4-1968 has been replaced by newer versions with
|
||||
slight modifications, but the 1968 version remains
|
||||
definitive for the Internet.
|
||||
|
||||
[IDNA2008-Mapping]
|
||||
Resnick, P. and P. Hoffman, "Mapping Characters in
|
||||
Internationalized Domain Names for Applications (IDNA)",
|
||||
Work in Progress, April 2010.
|
||||
|
||||
[RFC2671] Vixie, P., "Extension Mechanisms for DNS (EDNS0)",
|
||||
RFC 2671, August 1999.
|
||||
|
||||
[RFC3490] Faltstrom, P., Hoffman, P., and A. Costello,
|
||||
"Internationalizing Domain Names in Applications
|
||||
(IDNA)", RFC 3490, March 2003.
|
||||
|
||||
[RFC3491] Hoffman, P. and M. Blanchet, "Nameprep: A Stringprep
|
||||
Profile for Internationalized Domain Names (IDN)",
|
||||
RFC 3491, March 2003.
|
||||
|
||||
[RFC3986] Berners-Lee, T., Fielding, R., and L. Masinter, "Uniform
|
||||
Resource Identifier (URI): Generic Syntax", STD 66,
|
||||
RFC 3986, January 2005.
|
||||
|
||||
[RFC3987] Duerst, M. and M. Suignard, "Internationalized Resource
|
||||
Identifiers (IRIs)", RFC 3987, January 2005.
|
||||
|
||||
[RFC4690] Klensin, J., Faltstrom, P., Karp, C., and IAB, "Review
|
||||
and Recommendations for Internationalized Domain Names
|
||||
(IDNs)", RFC 4690, September 2006.
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 15]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
[RFC4952] Klensin, J. and Y. Ko, "Overview and Framework for
|
||||
Internationalized Email", RFC 4952, July 2007.
|
||||
|
||||
[RFC5894] Klensin, J., "Internationalized Domain Names for
|
||||
Applications (IDNA): Background, Explanation, and
|
||||
Rationale", RFC 5894, August 2010.
|
||||
|
||||
[Unicode] The Unicode Consortium, "The Unicode Standard, Version
|
||||
5.0", 2007. Boston, MA, USA: Addison-Wesley. ISBN
|
||||
0-321-48091-0. This printed reference has now been
|
||||
updated online to reflect additional code points. For
|
||||
code points, the reference at the time this document was
|
||||
published is to Unicode 5.2.
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 16]
|
||||
|
||||
RFC 5891 IDNA2008 Protocol August 2010
|
||||
|
||||
|
||||
Appendix A. Summary of Major Changes from IDNA2003
|
||||
|
||||
1. Update base character set from Unicode 3.2 to Unicode version
|
||||
agnostic.
|
||||
|
||||
2. Separate the definitions for the "registration" and "lookup"
|
||||
activities.
|
||||
|
||||
3. Disallow symbol and punctuation characters except where special
|
||||
exceptions are necessary.
|
||||
|
||||
4. Remove the mapping and normalization steps from the protocol and
|
||||
have them, instead, done by the applications themselves,
|
||||
possibly in a local fashion, before invoking the protocol.
|
||||
|
||||
5. Change the way that the protocol specifies which characters are
|
||||
allowed in labels from "humans decide what the table of code
|
||||
points contains" to "decision about code points are based on
|
||||
Unicode properties plus a small exclusion list created by
|
||||
humans".
|
||||
|
||||
6. Introduce the new concept of characters that can be used only in
|
||||
specific contexts.
|
||||
|
||||
7. Allow typical words and names in languages such as Dhivehi and
|
||||
Yiddish to be expressed.
|
||||
|
||||
8. Make bidirectional domain names (delimited strings of labels,
|
||||
not just labels standing on their own) display in a less
|
||||
surprising fashion, whether they appear in obvious domain name
|
||||
contexts or as part of running text in paragraphs.
|
||||
|
||||
9. Remove the dot separator from the mandatory part of the
|
||||
protocol.
|
||||
|
||||
10. Make some currently valid labels that are not actually IDNA
|
||||
labels invalid.
|
||||
|
||||
Author's Address
|
||||
|
||||
John C Klensin
|
||||
1770 Massachusetts Ave, Ste 322
|
||||
Cambridge, MA 02140
|
||||
USA
|
||||
|
||||
Phone: +1 617 245 1457
|
||||
EMail: john+ietf@jck.com
|
||||
|
||||
|
||||
|
||||
|
||||
Klensin Standards Track [Page 17]
|
||||
|
269
packages/testcases/input/nameprep/test-vectors-00.txt
Normal file
269
packages/testcases/input/nameprep/test-vectors-00.txt
Normal file
@ -0,0 +1,269 @@
|
||||
|
||||
# struct stringprep
|
||||
# {
|
||||
# char *comment;
|
||||
# char *in;
|
||||
# char *out;
|
||||
# char *profile;
|
||||
# int flags;
|
||||
# int rc;
|
||||
# }
|
||||
{
|
||||
"Map to nothing",
|
||||
"foo\xC2\xAD\xCD\x8F\xE1\xA0\x86\xE1\xA0\x8B"
|
||||
"bar""\xE2\x80\x8B\xE2\x81\xA0""baz\xEF\xB8\x80\xEF\xB8\x88"
|
||||
"\xEF\xB8\x8F\xEF\xBB\xBF", "foobarbaz"
|
||||
},
|
||||
{
|
||||
"Case folding ASCII U+0043 U+0041 U+0046 U+0045",
|
||||
"CAFE", "cafe"
|
||||
|
||||
|
||||
|
||||
Josefsson Expires August 2, 2003 [Page 31]
|
||||
|
||||
Internet-Draft Nameprep and IDNA Test Vectors February 2003
|
||||
|
||||
|
||||
},
|
||||
{
|
||||
"Case folding 8bit U+00DF (german sharp s)",
|
||||
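# There is a bug here in the original test vector (RicMoo): \xDF is not a valid UTF-8 continuation byte; U+00DF encodes as \xC3\x9F.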
|
||||
# VV
|
||||
# "\xC3\xDF", "ss"
|
||||
"\xC3\x9f", "ss"
|
||||
},
|
||||
{
|
||||
"Case folding U+0130 (turkish capital I with dot)",
|
||||
"\xC4\xB0", "i\xcc\x87"
|
||||
},
|
||||
{
|
||||
"Case folding multibyte U+0143 U+037A",
|
||||
"\xC5\x83\xCD\xBA", "\xC5\x84 \xCE\xB9"
|
||||
},
|
||||
{
|
||||
"Case folding U+2121 U+33C6 U+1D7BB",
|
||||
"\xE2\x84\xA1\xE3\x8F\x86\xF0\x9D\x9E\xBB",
|
||||
"telc\xE2\x88\x95""kg\xCF\x83"
|
||||
},
|
||||
{
|
||||
"Normalization of U+006a U+030c U+00A0 U+00AA",
|
||||
"\x6A\xCC\x8C\xC2\xA0\xC2\xAA", "\xC7\xB0 a"
|
||||
},
|
||||
{
|
||||
"Case folding U+1FB7 and normalization",
|
||||
"\xE1\xBE\xB7", "\xE1\xBE\xB6\xCE\xB9"
|
||||
},
|
||||
{
|
||||
"Self-reverting case folding U+01F0 and normalization",
|
||||
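# There is a bug here in the original test vector (RicMoo): \xF0 is not a valid UTF-8 continuation byte; U+01F0 encodes as \xC7\xB0.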
|
||||
# VV
|
||||
# "\xC7\xF0", "\xC7\xB0"
|
||||
"\xC7\xb0", "\xC7\xB0"
|
||||
},
|
||||
{
|
||||
"Self-reverting case folding U+0390 and normalization",
|
||||
"\xCE\x90", "\xCE\x90"
|
||||
},
|
||||
{
|
||||
"Self-reverting case folding U+03B0 and normalization",
|
||||
"\xCE\xB0", "\xCE\xB0"
|
||||
},
|
||||
{
|
||||
"Self-reverting case folding U+1E96 and normalization",
|
||||
"\xE1\xBA\x96", "\xE1\xBA\x96"
|
||||
},
|
||||
{
|
||||
"Self-reverting case folding U+1F56 and normalization",
|
||||
"\xE1\xBD\x96", "\xE1\xBD\x96"
|
||||
},
|
||||
{
|
||||
"ASCII space character U+0020",
|
||||
|
||||
|
||||
|
||||
Josefsson Expires August 2, 2003 [Page 32]
|
||||
|
||||
Internet-Draft Nameprep and IDNA Test Vectors February 2003
|
||||
|
||||
|
||||
"\x20", "\x20"
|
||||
},
|
||||
{
|
||||
"Non-ASCII 8bit space character U+00A0",
|
||||
"\xC2\xA0", "\x20"
|
||||
},
|
||||
{
|
||||
"Non-ASCII multibyte space character U+1680",
|
||||
"\xE1\x9A\x80", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Non-ASCII multibyte space character U+2000",
|
||||
"\xE2\x80\x80", "\x20"
|
||||
},
|
||||
{
|
||||
"Zero Width Space U+200b",
|
||||
"\xE2\x80\x8b", ""
|
||||
},
|
||||
{
|
||||
"Non-ASCII multibyte space character U+3000",
|
||||
"\xE3\x80\x80", "\x20"
|
||||
},
|
||||
{
|
||||
"ASCII control characters U+0010 U+007F",
|
||||
"\x10\x7F", "\x10\x7F"
|
||||
},
|
||||
{
|
||||
"Non-ASCII 8bit control character U+0085",
|
||||
"\xC2\x85", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Non-ASCII multibyte control character U+180E",
|
||||
"\xE1\xA0\x8E", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Zero Width No-Break Space U+FEFF",
|
||||
"\xEF\xBB\xBF", ""
|
||||
},
|
||||
{
|
||||
"Non-ASCII control character U+1D175",
|
||||
"\xF0\x9D\x85\xB5", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Plane 0 private use character U+F123",
|
||||
|
||||
|
||||
|
||||
Josefsson Expires August 2, 2003 [Page 33]
|
||||
|
||||
Internet-Draft Nameprep and IDNA Test Vectors February 2003
|
||||
|
||||
|
||||
"\xEF\x84\xA3", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Plane 15 private use character U+F1234",
|
||||
"\xF3\xB1\x88\xB4", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Plane 16 private use character U+10F234",
|
||||
"\xF4\x8F\x88\xB4", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Non-character code point U+8FFFE",
|
||||
"\xF2\x8F\xBF\xBE", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Non-character code point U+10FFFF",
|
||||
"\xF4\x8F\xBF\xBF", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Surrogate code U+DF42",
|
||||
"\xED\xBD\x82", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Non-plain text character U+FFFD",
|
||||
"\xEF\xBF\xBD", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Ideographic description character U+2FF5",
|
||||
"\xE2\xBF\xB5", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Display property character U+0341",
|
||||
"\xCD\x81", "\xCC\x81"
|
||||
},
|
||||
{
|
||||
"Left-to-right mark U+200E",
|
||||
"\xE2\x80\x8E", "\xCC\x81", "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
|
||||
|
||||
|
||||
Josefsson Expires August 2, 2003 [Page 34]
|
||||
|
||||
Internet-Draft Nameprep and IDNA Test Vectors February 2003
|
||||
|
||||
|
||||
"Deprecated U+202A",
|
||||
"\xE2\x80\xAA", "\xCC\x81", "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Language tagging character U+E0001",
|
||||
"\xF3\xA0\x80\x81", "\xCC\x81", "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Language tagging character U+E0042",
|
||||
"\xF3\xA0\x81\x82", NULL, "Nameprep", 0,
|
||||
STRINGPREP_CONTAINS_PROHIBITED
|
||||
},
|
||||
{
|
||||
"Bidi: RandALCat character U+05BE and LCat characters",
|
||||
"foo\xD6\xBE""bar", NULL, "Nameprep", 0,
|
||||
STRINGPREP_BIDI_BOTH_L_AND_RAL
|
||||
},
|
||||
{
|
||||
"Bidi: RandALCat character U+FD50 and LCat characters",
|
||||
"foo\xEF\xB5\x90""bar", NULL, "Nameprep", 0,
|
||||
STRINGPREP_BIDI_BOTH_L_AND_RAL
|
||||
},
|
||||
{
|
||||
"Bidi: RandALCat character U+FE76 and LCat characters",
|
||||
"foo\xEF\xB9\xB6""bar", "foo \xd9\x8e""bar"
|
||||
},
|
||||
{ "Bidi: RandALCat without trailing RandALCat U+0627 U+0031",
|
||||
"\xD8\xA7\x31", NULL, "Nameprep", 0,
|
||||
STRINGPREP_BIDI_LEADTRAIL_NOT_RAL}
|
||||
,
|
||||
{
|
||||
"Bidi: RandALCat character U+0627 U+0031 U+0628",
|
||||
"\xD8\xA7\x31\xD8\xA8", "\xD8\xA7\x31\xD8\xA8"
|
||||
},
|
||||
{
|
||||
"Unassigned code point U+E0002",
|
||||
"\xF3\xA0\x80\x82", NULL, "Nameprep", STRINGPREP_NO_UNASSIGNED,
|
||||
STRINGPREP_CONTAINS_UNASSIGNED
|
||||
},
|
||||
{
|
||||
"Larger test (shrinking)",
|
||||
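# There is a bug here in the original test vector (RicMoo): \xDF is not a valid UTF-8 continuation byte; U+00DF encodes as \xC3\x9F.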
|
||||
# VV
|
||||
# "X\xC2\xAD\xC3\xDF\xC4\xB0\xE2\x84\xA1\x6a\xcc\x8c\xc2\xa0\xc2"
|
||||
"X\xC2\xAD\xC3\x9F\xC4\xB0\xE2\x84\xA1\x6a\xcc\x8c\xc2\xa0\xc2"
|
||||
"\xaa\xce\xb0\xe2\x80\x80", "xssi\xcc\x87""tel\xc7\xb0 a\xce\xb0 ",
|
||||
"Nameprep"
|
||||
},
|
||||
{
|
||||
|
||||
|
||||
|
||||
Josefsson Expires August 2, 2003 [Page 35]
|
||||
|
||||
Internet-Draft Nameprep and IDNA Test Vectors February 2003
|
||||
|
||||
|
||||
"Larger test (expanding)",
|
||||
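# There is a bug here in the original test vector (RicMoo): \xDF is not a valid UTF-8 continuation byte; U+00DF encodes as \xC3\x9F.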
|
||||
# VV
|
||||
# "X\xC3\xDF\xe3\x8c\x96\xC4\xB0\xE2\x84\xA1\xE2\x92\x9F\xE3\x8c\x80",
|
||||
"X\xc3\x9F\xe3\x8c\x96\xC4\xB0\xE2\x84\xA1\xE2\x92\x9F\xE3\x8c\x80",
|
||||
"xss\xe3\x82\xad\xe3\x83\xad\xe3\x83\xa1\xe3\x83\xbc\xe3\x83\x88"
|
||||
"\xe3\x83\xab""i\xcc\x87""tel\x28""d\x29\xe3\x82\xa2\xe3\x83\x91"
|
||||
"\xe3\x83\xbc\xe3\x83\x88"
|
||||
},
|
Loading…
Reference in New Issue
Block a user