Code coverage for /20080809/includes/unicode.inc

Line #Times calledCode
1
<?php
2
// $Id: unicode.inc,v 1.31 2008/06/18 03:36:23 dries Exp $
3
4
/**
5
 * Indicates an error during check for PHP unicode support.
6
 */
72027
define('UNICODE_ERROR', -1);
8
9
/**
10
 * Indicates that standard PHP (emulated) unicode support is being used.
11
 */
122027
define('UNICODE_SINGLEBYTE', 0);
13
14
/**
15
 * Indicates that full unicode support with the PHP mbstring extension is
being
16
 * used.
17
 */
182027
define('UNICODE_MULTIBYTE', 1);
19
20
/**
21
 * Wrapper around _unicode_check().
22
 */
232027
function unicode_check() {
242027
  list($GLOBALS['multibyte']) = _unicode_check();
252027
}
26
27
/**
28
 * Perform checks about Unicode support in PHP, and set the right settings
if
29
 * needed.
30
 *
31
 * Because Drupal needs to be able to handle text in various encodings, we
do
32
 * not support mbstring function overloading. HTTP input/output conversion
must
33
 * be disabled for similar reasons.
34
 *
35
 * @param $errors
36
 *   Whether to report any fatal errors with form_set_error().
37
 */
382027
function _unicode_check() {
39
  // Ensure translations don't break at install time
402027
  $t = get_t();
41
42
  // Set the standard C locale to ensure consistent, ASCII-only string
handling.
432027
  setlocale(LC_CTYPE, 'C');
44
45
  // Check for outdated PCRE library
46
  // Note: we check if U+E2 is in the range U+E0 - U+E1. This test returns
TRUE on old PCRE versions.
472027
  if (preg_match('/[à-á]/u', 'â')) {
480
    return array(UNICODE_ERROR, $t('The PCRE library in your PHP
installation is outdated. This will cause problems when handling Unicode
text. If you are running PHP 4.3.3 or higher, make sure you are using the
PCRE library supplied by PHP. Please refer to the <a href="@url">PHP PCRE
documentation</a> for more information.', array('@url' =>
'http://www.php.net/pcre')));
490
  }
50
51
  // Check for mbstring extension
522027
  if (!function_exists('mb_strlen')) {
530
    return array(UNICODE_SINGLEBYTE, $t('Operations on Unicode strings are
emulated on a best-effort basis. Install the <a href="@url">PHP mbstring
extension</a> for improved Unicode support.', array('@url' =>
'http://www.php.net/mbstring')));
540
  }
55
56
  // Check mbstring configuration
572027
  if (ini_get('mbstring.func_overload') != 0) {
580
    return array(UNICODE_ERROR, $t('Multibyte string function overloading
in PHP is active and must be disabled. Check the php.ini
<em>mbstring.func_overload</em> setting. Please refer to the <a
href="@url">PHP mbstring documentation</a> for more information.',
array('@url' => 'http://www.php.net/mbstring')));
590
  }
602027
  if (ini_get('mbstring.encoding_translation') != 0) {
610
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in
PHP is active and must be disabled. Check the php.ini
<em>mbstring.encoding_translation</em> setting. Please refer to the <a
href="@url">PHP mbstring documentation</a> for more information.',
array('@url' => 'http://www.php.net/mbstring')));
620
  }
632027
  if (ini_get('mbstring.http_input') != 'pass') {
640
    return array(UNICODE_ERROR, $t('Multibyte string input conversion in
PHP is active and must be disabled. Check the php.ini
<em>mbstring.http_input</em> setting. Please refer to the <a
href="@url">PHP mbstring documentation</a> for more information.',
array('@url' => 'http://www.php.net/mbstring')));
650
  }
662027
  if (ini_get('mbstring.http_output') != 'pass') {
670
    return array(UNICODE_ERROR, $t('Multibyte string output conversion in
PHP is active and must be disabled. Check the php.ini
<em>mbstring.http_output</em> setting. Please refer to the <a
href="@url">PHP mbstring documentation</a> for more information.',
array('@url' => 'http://www.php.net/mbstring')));
680
  }
69
70
  // Set appropriate configuration
712027
  mb_internal_encoding('utf-8');
722027
  mb_language('uni');
732027
  return array(UNICODE_MULTIBYTE, '');
740
}
75
76
/**
77
 * Return Unicode library status and errors.
78
 */
792027
function unicode_requirements() {
80
  // Ensure translations don't break at install time
813
  $t = get_t();
82
83
  $libraries = array(
843
    UNICODE_SINGLEBYTE => $t('Standard PHP'),
853
    UNICODE_MULTIBYTE => $t('PHP Mbstring Extension'),
863
    UNICODE_ERROR => $t('Error'),
873
  );
88
  $severities = array(
893
    UNICODE_SINGLEBYTE => REQUIREMENT_WARNING,
903
    UNICODE_MULTIBYTE => REQUIREMENT_OK,
913
    UNICODE_ERROR => REQUIREMENT_ERROR,
923
  );
933
  list($library, $description) = _unicode_check();
94
953
  $requirements['unicode'] = array(
963
    'title' => $t('Unicode library'),
973
    'value' => $libraries[$library],
98
  );
993
  if ($description) {
1000
    $requirements['unicode']['description'] = $description;
1010
  }
102
1033
  $requirements['unicode']['severity'] = $severities[$library];
104
1053
  return $requirements;
1060
}
107
108
/**
109
 * Prepare a new XML parser.
110
 *
111
 * This is a wrapper around xml_parser_create() which extracts the encoding
from
112
 * the XML data first and sets the output encoding to UTF-8. This function
should
113
 * be used instead of xml_parser_create(), because PHP 4's XML parser
doesn't
114
 * check the input encoding itself. "Starting from PHP 5, the input
encoding is
115
 * automatically detected, so that the encoding parameter specifies only
the
116
 * output encoding."
117
 *
118
 * This is also where unsupported encodings will be converted. Callers
should
119
 * take this into account: $data might have been changed after the call.
120
 *
121
 * @param &$data
122
 *   The XML data which will be parsed later.
123
 * @return
124
 *   An XML parser object or FALSE on error.
125
 */
1262027
function drupal_xml_parser_create(&$data) {
127
  // Default XML encoding is UTF-8
1286
  $encoding = 'utf-8';
1296
  $bom = FALSE;
130
131
  // Check for UTF-8 byte order mark (PHP5's XML parser doesn't handle
it).
1326
  if (!strncmp($data, "\xEF\xBB\xBF", 3)) {
1330
    $bom = TRUE;
1340
    $data = substr($data, 3);
1350
  }
136
137
  // Check for an encoding declaration in the XML prolog if no BOM was
found.
1386
  if (!$bom && ereg('^<\?xml[^>]+encoding="([^"]+)"', $data, $match)) {
1394
    $encoding = $match[1];
1404
  }
141
142
  // Unsupported encodings are converted here into UTF-8.
1436
  $php_supported = array('utf-8', 'iso-8859-1', 'us-ascii');
1446
  if (!in_array(strtolower($encoding), $php_supported)) {
1450
    $out = drupal_convert_to_utf8($data, $encoding);
1460
    if ($out !== FALSE) {
1470
      $encoding = 'utf-8';
1480
      $data = ereg_replace('^(<\?xml[^>]+encoding)="([^"]+)"',
'\\1="utf-8"', $out);
1490
    }
150
    else {
1510
      watchdog('php', 'Could not convert XML encoding %s to UTF-8.',
array('%s' => $encoding), WATCHDOG_WARNING);
1520
      return FALSE;
153
    }
1540
  }
155
1566
  $xml_parser = xml_parser_create($encoding);
1576
  xml_parser_set_option($xml_parser, XML_OPTION_TARGET_ENCODING, 'utf-8');
1586
  return $xml_parser;
1590
}
160
161
/**
162
 * Convert data to UTF-8
163
 *
164
 * Requires the iconv, GNU recode or mbstring PHP extension.
165
 *
166
 * @param $data
167
 *   The data to be converted.
168
 * @param $encoding
169
 *   The encoding that the data is in
170
 * @return
171
 *   Converted data or FALSE.
172
 */
1732027
function drupal_convert_to_utf8($data, $encoding) {
1740
  if (function_exists('iconv')) {
1750
    $out = @iconv($encoding, 'utf-8', $data);
1760
  }
1770
  else if (function_exists('mb_convert_encoding')) {
1780
    $out = @mb_convert_encoding($data, 'utf-8', $encoding);
1790
  }
1800
  else if (function_exists('recode_string')) {
1810
    $out = @recode_string($encoding . '..utf-8', $data);
1820
  }
183
  else {
1840
    watchdog('php', 'Unsupported encoding %s. Please install iconv, GNU
recode or mbstring for PHP.', array('%s' => $encoding), WATCHDOG_ERROR);
1850
    return FALSE;
186
  }
187
1880
  return $out;
1890
}
190
191
/**
192
 * Truncate a UTF-8-encoded string safely to a number of bytes.
193
 *
194
 * If the end position is in the middle of a UTF-8 sequence, it scans
backwards
195
 * until the beginning of the byte sequence.
196
 *
197
 * Use this function whenever you want to chop off a string at an unsure
198
 * location. On the other hand, if you're sure that you're splitting on a
199
 * character boundary (e.g. after using strpos() or similar), you can
safely use
200
 * substr() instead.
201
 *
202
 * @param $string
203
 *   The string to truncate.
204
 * @param $len
205
 *   An upper limit on the returned string length.
206
 * @return
207
 *   The truncated string.
208
 */
2092027
function drupal_truncate_bytes($string, $len) {
2100
  if (strlen($string) <= $len) {
2110
    return $string;
2120
  }
2130
  if ((ord($string[$len]) < 0x80) || (ord($string[$len]) >= 0xC0)) {
2140
    return substr($string, 0, $len);
2150
  }
2160
  while (--$len >= 0 && ord($string[$len]) >= 0x80 && ord($string[$len]) <
0xC0) {};
2170
  return substr($string, 0, $len);
2180
}
219
220
/**
221
 * Truncate a UTF-8-encoded string safely to a number of characters.
222
 *
223
 * @param $string
224
 *   The string to truncate.
225
 * @param $len
226
 *   An upper limit on the returned string length.
227
 * @param $wordsafe
228
 *   Flag to truncate at last space within the upper limit. Defaults to
FALSE.
229
 * @param $dots
230
 *   Flag to add trailing dots. Defaults to FALSE.
231
 * @return
232
 *   The truncated string.
233
 */
2342027
function truncate_utf8($string, $len, $wordsafe = FALSE, $dots = FALSE) {
235
236166
  if (drupal_strlen($string) <= $len) {
237165
    return $string;
2380
  }
239
24015
  if ($dots) {
2418
    $len -= 4;
2428
  }
243
24415
  if ($wordsafe) {
24514
    $string = drupal_substr($string, 0, $len + 1); // leave one more
character
24614
    if ($last_space = strrpos($string, ' ')) { // space exists AND is not
on position 0
2478
      $string = substr($string, 0, $last_space);
2488
    }
249
    else {
2506
      $string = drupal_substr($string, 0, $len);
251
    }
25214
  }
253
  else {
2541
    $string = drupal_substr($string, 0, $len);
255
  }
256
25715
  if ($dots) {
2588
    $string .= ' ...';
2598
  }
260
26115
  return $string;
2620
}
263
264
/**
265
 * Encodes MIME/HTTP header values that contain non-ASCII, UTF-8 encoded
266
 * characters.
267
 *
268
 * For example, mime_header_encode('tést.txt') returns
"=?UTF-8?B?dMOpc3QudHh0?=".
269
 *
270
 * See http://www.rfc-editor.org/rfc/rfc2047.txt for more information.
271
 *
272
 * Notes:
273
 * - Only encode strings that contain non-ASCII characters.
274
 * - We progressively cut-off a chunk with truncate_utf8(). This is to
ensure
275
 *   each chunk starts and ends on a character boundary.
276
 * - Using \n as the chunk separator may cause problems on some systems and
may
277
 *   have to be changed to \r\n or \r.
278
 */
2792027
function mime_header_encode($string) {
2806
  if (preg_match('/[^\x20-\x7E]/', $string)) {
2810
    $chunk_size = 47; // floor((75 - strlen("=?UTF-8?B??=")) * 0.75);
2820
    $len = strlen($string);
2830
    $output = '';
2840
    while ($len > 0) {
2850
      $chunk = drupal_truncate_bytes($string, $chunk_size);
2860
      $output .= ' =?UTF-8?B?' . base64_encode($chunk) . "?=\n";
2870
      $c = strlen($chunk);
2880
      $string = substr($string, $c);
2890
      $len -= $c;
2900
    }
2910
    return trim($output);
2920
  }
2936
  return $string;
2940
}
295
296
/**
297
 * Complement to mime_header_encode
298
 */
2992027
function mime_header_decode($header) {
300
  // First step: encoded chunks followed by other encoded chunks (need to
collapse whitespace)
3010
  $header =
preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=\s+(?==\?)/',
'_mime_header_decode', $header);
302
  // Second step: remaining chunks (do not collapse whitespace)
3030
  return preg_replace_callback('/=\?([^?]+)\?(Q|B)\?([^?]+|\?(?!=))\?=/',
'_mime_header_decode', $header);
3040
}
305
306
/**
307
 * Helper function to mime_header_decode
308
 */
3092027
function _mime_header_decode($matches) {
310
  // Regexp groups:
311
  // 1: Character set name
312
  // 2: Escaping method (Q or B)
313
  // 3: Encoded data
3140
  $data = ($matches[2] == 'B') ? base64_decode($matches[3]) :
str_replace('_', ' ', quoted_printable_decode($matches[3]));
3150
  if (strtolower($matches[1]) != 'utf-8') {
3160
    $data = drupal_convert_to_utf8($data, $matches[1]);
3170
  }
3180
  return $data;
3190
}
320
321
/**
322
 * Decode all HTML entities (including numerical ones) to regular UTF-8
bytes.
323
 * Double-escaped entities will only be decoded once ("&amp;lt;" becomes
"&lt;", not "<").
324
 *
325
 * @param $text
326
 *   The text to decode entities in.
327
 * @param $exclude
328
 *   An array of characters which should not be decoded. For example,
329
 *   array('<', '&', '"'). This affects both named and numerical entities.
330
 */
3312027
function decode_entities($text, $exclude = array()) {
33214
  static $table;
333
  // We store named entities in a table for quick processing.
33414
  if (!isset($table)) {
335
    // Get all named HTML entities.
33614
    $table = array_flip(get_html_translation_table(HTML_ENTITIES));
337
    // PHP gives us ISO-8859-1 data, we need UTF-8.
33814
    $table = array_map('utf8_encode', $table);
339
    // Add apostrophe (XML)
34014
    $table['&apos;'] = "'";
34114
  }
34214
  $newtable = array_diff($table, $exclude);
343
344
  // Use a regexp to select all entities in one pass, to avoid decoding
double-escaped entities twice.
34514
  return preg_replace('/&(#x?)?([A-Za-z0-9]+);/e', '_decode_entities("$1",
"$2", "$0", $newtable, $exclude)', $text);
3460
}
347
348
/**
349
 * Helper function for decode_entities
350
 */
3512027
function _decode_entities($prefix, $codepoint, $original, &$table,
&$exclude) {
352
  // Named entity
3530
  if (!$prefix) {
3540
    if (isset($table[$original])) {
3550
      return $table[$original];
3560
    }
357
    else {
3580
      return $original;
359
    }
3600
  }
361
  // Hexadecimal numerical entity
3620
  if ($prefix == '#x') {
3630
    $codepoint = base_convert($codepoint, 16, 10);
3640
  }
365
  // Decimal numerical entity (strip leading zeros to avoid PHP octal
notation)
366
  else {
3670
    $codepoint = preg_replace('/^0+/', '', $codepoint);
368
  }
369
  // Encode codepoint as UTF-8 bytes
3700
  if ($codepoint < 0x80) {
3710
    $str = chr($codepoint);
3720
  }
3730
  else if ($codepoint < 0x800) {
3740
    $str = chr(0xC0 | ($codepoint >> 6))
3750
         . chr(0x80 | ($codepoint & 0x3F));
3760
  }
3770
  else if ($codepoint < 0x10000) {
3780
    $str = chr(0xE0 | ( $codepoint >> 12))
3790
         . chr(0x80 | (($codepoint >> 6) & 0x3F))
3800
         . chr(0x80 | ( $codepoint       & 0x3F));
3810
  }
3820
  else if ($codepoint < 0x200000) {
3830
    $str = chr(0xF0 | ( $codepoint >> 18))
3840
         . chr(0x80 | (($codepoint >> 12) & 0x3F))
3850
         . chr(0x80 | (($codepoint >> 6)  & 0x3F))
3860
         . chr(0x80 | ( $codepoint        & 0x3F));
3870
  }
388
  // Check for excluded characters
3890
  if (in_array($str, $exclude)) {
3900
    return $original;
3910
  }
392
  else {
3930
    return $str;
394
  }
3950
}
396
397
/**
398
 * Count the amount of characters in a UTF-8 string. This is less than or
399
 * equal to the byte count.
400
 */
4012027
function drupal_strlen($text) {
402741
  global $multibyte;
403741
  if ($multibyte == UNICODE_MULTIBYTE) {
404741
    return mb_strlen($text);
4050
  }
406
  else {
407
    // Do not count UTF-8 continuation bytes.
4080
    return strlen(preg_replace("/[\x80-\xBF]/", '', $text));
409
  }
4100
}
411
412
/**
413
 * Uppercase a UTF-8 string.
414
 */
4152027
function drupal_strtoupper($text) {
416149
  global $multibyte;
417149
  if ($multibyte == UNICODE_MULTIBYTE) {
418149
    return mb_strtoupper($text);
4190
  }
420
  else {
421
    // Use C-locale for ASCII-only uppercase
4220
    $text = strtoupper($text);
423
    // Case flip Latin-1 accented letters
4240
    $text = preg_replace_callback('/\xC3[\xA0-\xB6\xB8-\xBE]/',
'_unicode_caseflip', $text);
4250
    return $text;
426
  }
4270
}
428
429
/**
430
 * Lowercase a UTF-8 string.
431
 */
4322027
function drupal_strtolower($text) {
4331501
  global $multibyte;
4341501
  if ($multibyte == UNICODE_MULTIBYTE) {
4351501
    return mb_strtolower($text);
4360
  }
437
  else {
438
    // Use C-locale for ASCII-only lowercase
4390
    $text = strtolower($text);
440
    // Case flip Latin-1 accented letters
4410
    $text = preg_replace_callback('/\xC3[\x80-\x96\x98-\x9E]/',
'_unicode_caseflip', $text);
4420
    return $text;
443
  }
4440
}
445
446
/**
447
 * Helper function for case conversion of Latin-1.
448
 * Used for flipping U+C0-U+DE to U+E0-U+FD and back.
449
 */
4502027
function _unicode_caseflip($matches) {
4510
  return $matches[0][0] . chr(ord($matches[0][1]) ^ 32);
4520
}
453
454
/**
455
 * Capitalize the first letter of a UTF-8 string.
456
 */
4572027
function drupal_ucfirst($text) {
458
  // Note: no mbstring equivalent!
459102
  return drupal_strtoupper(drupal_substr($text, 0, 1)) .
drupal_substr($text, 1);
4600
}
461
462
/**
463
 * Cut off a piece of a string based on character indices and counts.
Follows
464
 * the same behavior as PHP's own substr() function.
465
 *
466
 * Note that for cutting off a string at a known character/substring
467
 * location, the usage of PHP's normal strpos/substr is safe and
468
 * much faster.
469
 */
4702027
function drupal_substr($text, $start, $length = NULL) {
471210
  global $multibyte;
472210
  if ($multibyte == UNICODE_MULTIBYTE) {
473210
    return $length === NULL ? mb_substr($text, $start) : mb_substr($text,
$start, $length);
4740
  }
475
  else {
4760
    $strlen = strlen($text);
477
    // Find the starting byte offset
4780
    $bytes = 0;
4790
    if ($start > 0) {
480
      // Count all the continuation bytes from the start until we have
found
481
      // $start characters
4820
      $bytes = -1; $chars = -1;
4830
      while ($bytes < $strlen && $chars < $start) {
4840
        $bytes++;
4850
        $c = ord($text[$bytes]);
4860
        if ($c < 0x80 || $c >= 0xC0) {
4870
          $chars++;
4880
        }
4890
      }
4900
    }
4910
    else if ($start < 0) {
492
      // Count all the continuation bytes from the end until we have found
493
      // abs($start) characters
4940
      $start = abs($start);
4950
      $bytes = $strlen; $chars = 0;
4960
      while ($bytes > 0 && $chars < $start) {
4970
        $bytes--;
4980
        $c = ord($text[$bytes]);
4990
        if ($c < 0x80 || $c >= 0xC0) {
5000
          $chars++;
5010
        }
5020
      }
5030
    }
5040
    $istart = $bytes;
505
506
    // Find the ending byte offset
5070
    if ($length === NULL) {
5080
      $bytes = $strlen - 1;
5090
    }
5100
    else if ($length > 0) {
511
      // Count all the continuation bytes from the starting index until we
have
512
      // found $length + 1 characters. Then backtrack one byte.
5130
      $bytes = $istart; $chars = 0;
5140
      while ($bytes < $strlen && $chars < $length) {
5150
        $bytes++;
5160
        $c = ord($text[$bytes]);
5170
        if ($c < 0x80 || $c >= 0xC0) {
5180
          $chars++;
5190
        }
5200
      }
5210
      $bytes--;
5220
    }
5230
    else if ($length < 0) {
524
      // Count all the continuation bytes from the end until we have found
525
      // abs($length) characters
5260
      $length = abs($length);
5270
      $bytes = $strlen - 1; $chars = 0;
5280
      while ($bytes >= 0 && $chars < $length) {
5290
        $c = ord($text[$bytes]);
5300
        if ($c < 0x80 || $c >= 0xC0) {
5310
          $chars++;
5320
        }
5330
        $bytes--;
5340
      }
5350
    }
5360
    $iend = $bytes;
537
5380
    return substr($text, $istart, max(0, $iend - $istart + 1));
539
  }
5400
}
541
542
5432027