00001 /* punycode.c --- Implementation of punycode used to ASCII encode IDN's. 00002 * Copyright (C) 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009 Simon Josefsson 00003 * 00004 * This file is part of GNU Libidn. 00005 * 00006 * GNU Libidn is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * GNU Libidn is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with GNU Libidn; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 00019 * 00020 */ 00021 00022 /* 00023 * This file is derived from RFC 3492bis written by Adam M. Costello. 00024 * 00025 * Disclaimer and license: Regarding this entire document or any 00026 * portion of it (including the pseudocode and C code), the author 00027 * makes no guarantees and is not responsible for any damage resulting 00028 * from its use. The author grants irrevocable permission to anyone 00029 * to use, modify, and distribute it in any way that does not diminish 00030 * the rights of anyone else to use, modify, and distribute it, 00031 * provided that redistributed derivative works do not contain 00032 * misleading author or version information. Derivative works need 00033 * not be licensed under similar terms. 00034 * 00035 * Copyright (C) The Internet Society (2003). All Rights Reserved. 00036 * 00037 * This document and translations of it may be copied and furnished to 00038 * others, and derivative works that comment on or otherwise explain it 00039 * or assist in its implementation may be prepared, copied, published 00040 * and distributed, in whole or in part, without restriction of any 00041 * kind, provided that the above copyright notice and this paragraph are 00042 * included on all such copies and derivative works. However, this 00043 * document itself may not be modified in any way, such as by removing 00044 * the copyright notice or references to the Internet Society or other 00045 * Internet organizations, except as needed for the purpose of 00046 * developing Internet standards in which case the procedures for 00047 * copyrights defined in the Internet Standards process must be 00048 * followed, or as required to translate it into languages other than 00049 * English. 00050 * 00051 * The limited permissions granted above are perpetual and will not be 00052 * revoked by the Internet Society or its successors or assigns. 00053 * 00054 * This document and the information contained herein is provided on an 00055 * "AS IS" basis and THE INTERNET SOCIETY AND THE INTERNET ENGINEERING 00056 * TASK FORCE DISCLAIMS ALL WARRANTIES, EXPRESS OR IMPLIED, INCLUDING 00057 * BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE INFORMATION 00058 * HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED WARRANTIES OF 00059 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 00060 */ 00061 00062 #include <config.h> 00063 #include <string.h> 00064 00065 #include "punycode.h" 00066 00067 /*** Bootstring parameters for Punycode ***/ 00068 00069 enum 00070 { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700, 00071 initial_bias = 72, initial_n = 0x80, delimiter = 0x2D 00072 }; 00073 00074 /* basic(cp) tests whether cp is a basic code point: */ 00075 #define basic(cp) ((punycode_uint)(cp) < 0x80) 00076 00077 /* delim(cp) tests whether cp is a delimiter: */ 00078 #define delim(cp) ((cp) == delimiter) 00079 00080 /* decode_digit(cp) returns the numeric value of a basic code */ 00081 /* point (for use in representing integers) in the range 0 to */ 00082 /* base-1, or base if cp does not represent a value. */ 00083 00084 static punycode_uint 00085 decode_digit (punycode_uint cp) 00086 { 00087 return cp - 48 < 10 ? cp - 22 : cp - 65 < 26 ? cp - 65 : 00088 cp - 97 < 26 ? cp - 97 : base; 00089 } 00090 00091 /* encode_digit(d,flag) returns the basic code point whose value */ 00092 /* (when used for representing integers) is d, which needs to be in */ 00093 /* the range 0 to base-1. The lowercase form is used unless flag is */ 00094 /* nonzero, in which case the uppercase form is used. The behavior */ 00095 /* is undefined if flag is nonzero and digit d has no uppercase form. */ 00096 00097 static char 00098 encode_digit (punycode_uint d, int flag) 00099 { 00100 return d + 22 + 75 * (d < 26) - ((flag != 0) << 5); 00101 /* 0..25 map to ASCII a..z or A..Z */ 00102 /* 26..35 map to ASCII 0..9 */ 00103 } 00104 00105 /* flagged(bcp) tests whether a basic code point is flagged */ 00106 /* (uppercase). The behavior is undefined if bcp is not a */ 00107 /* basic code point. */ 00108 00109 #define flagged(bcp) ((punycode_uint)(bcp) - 65 < 26) 00110 00111 /* encode_basic(bcp,flag) forces a basic code point to lowercase */ 00112 /* if flag is zero, uppercase if flag is nonzero, and returns */ 00113 /* the resulting code point. The code point is unchanged if it */ 00114 /* is caseless. The behavior is undefined if bcp is not a basic */ 00115 /* code point. */ 00116 00117 static char 00118 encode_basic (punycode_uint bcp, int flag) 00119 { 00120 bcp -= (bcp - 97 < 26) << 5; 00121 return bcp + ((!flag && (bcp - 65 < 26)) << 5); 00122 } 00123 00124 /*** Platform-specific constants ***/ 00125 00126 /* maxint is the maximum value of a punycode_uint variable: */ 00127 static const punycode_uint maxint = -1; 00128 /* Because maxint is unsigned, -1 becomes the maximum value. */ 00129 00130 /*** Bias adaptation function ***/ 00131 00132 static punycode_uint 00133 adapt (punycode_uint delta, punycode_uint numpoints, int firsttime) 00134 { 00135 punycode_uint k; 00136 00137 delta = firsttime ? delta / damp : delta >> 1; 00138 /* delta >> 1 is a faster way of doing delta / 2 */ 00139 delta += delta / numpoints; 00140 00141 for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base) 00142 { 00143 delta /= base - tmin; 00144 } 00145 00146 return k + (base - tmin + 1) * delta / (delta + skew); 00147 } 00148 00149 /*** Main encode function ***/ 00150 00188 int 00189 punycode_encode (size_t input_length, 00190 const punycode_uint input[], 00191 const unsigned char case_flags[], 00192 size_t * output_length, char output[]) 00193 { 00194 punycode_uint input_len, n, delta, h, b, bias, j, m, q, k, t; 00195 size_t out, max_out; 00196 00197 /* The Punycode spec assumes that the input length is the same type */ 00198 /* of integer as a code point, so we need to convert the size_t to */ 00199 /* a punycode_uint, which could overflow. */ 00200 00201 if (input_length > maxint) 00202 return punycode_overflow; 00203 input_len = (punycode_uint) input_length; 00204 00205 /* Initialize the state: */ 00206 00207 n = initial_n; 00208 delta = 0; 00209 out = 0; 00210 max_out = *output_length; 00211 bias = initial_bias; 00212 00213 /* Handle the basic code points: */ 00214 00215 for (j = 0; j < input_len; ++j) 00216 { 00217 if (basic (input[j])) 00218 { 00219 if (max_out - out < 2) 00220 return punycode_big_output; 00221 output[out++] = case_flags ? 00222 encode_basic (input[j], case_flags[j]) : (char) input[j]; 00223 } 00224 /* else if (input[j] < n) return punycode_bad_input; */ 00225 /* (not needed for Punycode with unsigned code points) */ 00226 } 00227 00228 h = b = (punycode_uint) out; 00229 /* cannot overflow because out <= input_len <= maxint */ 00230 00231 /* h is the number of code points that have been handled, b is the */ 00232 /* number of basic code points, and out is the number of ASCII code */ 00233 /* points that have been output. */ 00234 00235 if (b > 0) 00236 output[out++] = delimiter; 00237 00238 /* Main encoding loop: */ 00239 00240 while (h < input_len) 00241 { 00242 /* All non-basic code points < n have been */ 00243 /* handled already. Find the next larger one: */ 00244 00245 for (m = maxint, j = 0; j < input_len; ++j) 00246 { 00247 /* if (basic(input[j])) continue; */ 00248 /* (not needed for Punycode) */ 00249 if (input[j] >= n && input[j] < m) 00250 m = input[j]; 00251 } 00252 00253 /* Increase delta enough to advance the decoder's */ 00254 /* <n,i> state to <m,0>, but guard against overflow: */ 00255 00256 if (m - n > (maxint - delta) / (h + 1)) 00257 return punycode_overflow; 00258 delta += (m - n) * (h + 1); 00259 n = m; 00260 00261 for (j = 0; j < input_len; ++j) 00262 { 00263 /* Punycode does not need to check whether input[j] is basic: */ 00264 if (input[j] < n /* || basic(input[j]) */ ) 00265 { 00266 if (++delta == 0) 00267 return punycode_overflow; 00268 } 00269 00270 if (input[j] == n) 00271 { 00272 /* Represent delta as a generalized variable-length integer: */ 00273 00274 for (q = delta, k = base;; k += base) 00275 { 00276 if (out >= max_out) 00277 return punycode_big_output; 00278 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00279 k >= bias + tmax ? tmax : k - bias; 00280 if (q < t) 00281 break; 00282 output[out++] = encode_digit (t + (q - t) % (base - t), 0); 00283 q = (q - t) / (base - t); 00284 } 00285 00286 output[out++] = encode_digit (q, case_flags && case_flags[j]); 00287 bias = adapt (delta, h + 1, h == b); 00288 delta = 0; 00289 ++h; 00290 } 00291 } 00292 00293 ++delta, ++n; 00294 } 00295 00296 *output_length = out; 00297 return punycode_success; 00298 } 00299 00300 /*** Main decode function ***/ 00301 00337 int 00338 punycode_decode (size_t input_length, 00339 const char input[], 00340 size_t * output_length, 00341 punycode_uint output[], unsigned char case_flags[]) 00342 { 00343 punycode_uint n, out, i, max_out, bias, oldi, w, k, digit, t; 00344 size_t b, j, in; 00345 00346 /* Initialize the state: */ 00347 00348 n = initial_n; 00349 out = i = 0; 00350 max_out = *output_length > maxint ? maxint 00351 : (punycode_uint) * output_length; 00352 bias = initial_bias; 00353 00354 /* Handle the basic code points: Let b be the number of input code */ 00355 /* points before the last delimiter, or 0 if there is none, then */ 00356 /* copy the first b code points to the output. */ 00357 00358 for (b = j = 0; j < input_length; ++j) 00359 if (delim (input[j])) 00360 b = j; 00361 if (b > max_out) 00362 return punycode_big_output; 00363 00364 for (j = 0; j < b; ++j) 00365 { 00366 if (case_flags) 00367 case_flags[out] = flagged (input[j]); 00368 if (!basic (input[j])) 00369 return punycode_bad_input; 00370 output[out++] = input[j]; 00371 } 00372 00373 /* Main decoding loop: Start just after the last delimiter if any */ 00374 /* basic code points were copied; start at the beginning otherwise. */ 00375 00376 for (in = b > 0 ? b + 1 : 0; in < input_length; ++out) 00377 { 00378 00379 /* in is the index of the next ASCII code point to be consumed, */ 00380 /* and out is the number of code points in the output array. */ 00381 00382 /* Decode a generalized variable-length integer into delta, */ 00383 /* which gets added to i. The overflow checking is easier */ 00384 /* if we increase i as we go, then subtract off its starting */ 00385 /* value at the end to obtain delta. */ 00386 00387 for (oldi = i, w = 1, k = base;; k += base) 00388 { 00389 if (in >= input_length) 00390 return punycode_bad_input; 00391 digit = decode_digit (input[in++]); 00392 if (digit >= base) 00393 return punycode_bad_input; 00394 if (digit > (maxint - i) / w) 00395 return punycode_overflow; 00396 i += digit * w; 00397 t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */ 00398 k >= bias + tmax ? tmax : k - bias; 00399 if (digit < t) 00400 break; 00401 if (w > maxint / (base - t)) 00402 return punycode_overflow; 00403 w *= (base - t); 00404 } 00405 00406 bias = adapt (i - oldi, out + 1, oldi == 0); 00407 00408 /* i was supposed to wrap around from out+1 to 0, */ 00409 /* incrementing n each time, so we'll fix that now: */ 00410 00411 if (i / (out + 1) > maxint - n) 00412 return punycode_overflow; 00413 n += i / (out + 1); 00414 i %= (out + 1); 00415 00416 /* Insert n at position i of the output: */ 00417 00418 /* not needed for Punycode: */ 00419 /* if (basic(n)) return punycode_invalid_input; */ 00420 if (out >= max_out) 00421 return punycode_big_output; 00422 00423 if (case_flags) 00424 { 00425 memmove (case_flags + i + 1, case_flags + i, out - i); 00426 /* Case of last ASCII code point determines case flag: */ 00427 case_flags[i] = flagged (input[in - 1]); 00428 } 00429 00430 memmove (output + i + 1, output + i, (out - i) * sizeof *output); 00431 output[i++] = n; 00432 } 00433 00434 *output_length = (size_t) out; 00435 /* cannot overflow because out <= old value of *output_length */ 00436 return punycode_success; 00437 } 00438