Source for file specials.php

Documentation is available at specials.php

  1. <?php
  2. /**
  3. * Utilities for processing "special" characters in UTF-8. "Special" largely means anything which would
  4. * be regarded as a non-word character, like ASCII control characters and punctuation. This has a "Roman"
  5. * bias - it would be unaware of modern Chinese "punctuation" characters for example.
  6. * Note: requires utils/unicode.php to be loaded
  7. * @version $Id: specials.php,v 1.1 2007/09/09 20:39:51 pitlinz Exp $
  8. * @package utf8
  9. * @subpackage utils
  10. * @see utf8_is_valid
  11. */
  12.  
  13. //--------------------------------------------------------------------
  /**
   * Used internally. Builds a PCRE pattern from the $UTF8_SPECIAL_CHARS
   * array defined in this function.
   *
   * $UTF8_SPECIAL_CHARS should contain all special characters
   * (non-letter/non-digit) defined in the various local charsets - it is not
   * a complete list of non-alphanumeric characters in UTF-8. It is not perfect
   * but should match most cases of special chars.
   *
   * The control chars 0x00 to 0x19 are not part of $UTF8_SPECIAL_CHARS; they
   * are added to the character class as a literal range when the pattern is
   * assembled.
   *
   * The pattern is built on the first call only and cached in a static local,
   * so every later call returns the same string.
   *
   * Note: requires utf8_from_unicode() (utils/unicode.php) to be loaded.
   *
   * @package utf8
   * @subpackage utils
   * @return string PCRE pattern of the form '/[...]/u' matching one special character
   * @see utf8_from_unicode
   * @see utf8_is_word_chars
   * @see utf8_strip_specials
   */
  function utf8_specials_pattern() {
      static $pattern = NULL;

      if (!$pattern) {
          // Unicode code points treated as "special". Hyphen (0x002d),
          // period (0x002e) and underscore (0x005f) are absent from the
          // list, so they are not matched by the resulting pattern.
          $UTF8_SPECIAL_CHARS = array(
              0x001a, 0x001b, 0x001c, 0x001d, 0x001e, 0x001f, 0x0020, 0x0021, 0x0022, 0x0023,
              0x0024, 0x0025, 0x0026, 0x0027, 0x0028, 0x0029, 0x002a, 0x002b, 0x002c,
              0x002f, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, 0x0040, 0x005b,
              0x005c, 0x005d, 0x005e, 0x0060, 0x007b, 0x007c, 0x007d, 0x007e,
              0x007f, 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, 0x0088,
              0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, 0x0090, 0x0091, 0x0092,
              0x0093, 0x0094, 0x0095, 0x0096, 0x0097, 0x0098, 0x0099, 0x009a, 0x009b, 0x009c,
              0x009d, 0x009e, 0x009f, 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6,
              0x00a7, 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, 0x00b0,
              0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, 0x00b8, 0x00b9, 0x00ba,
              0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, 0x00d7, 0x00f7, 0x02c7, 0x02d8, 0x02d9,
              0x02da, 0x02db, 0x02dc, 0x02dd, 0x0300, 0x0301, 0x0303, 0x0309, 0x0323, 0x0384,
              0x0385, 0x0387, 0x03b2, 0x03c6, 0x03d1, 0x03d2, 0x03d5, 0x03d6, 0x05b0, 0x05b1,
              0x05b2, 0x05b3, 0x05b4, 0x05b5, 0x05b6, 0x05b7, 0x05b8, 0x05b9, 0x05bb, 0x05bc,
              0x05bd, 0x05be, 0x05bf, 0x05c0, 0x05c1, 0x05c2, 0x05c3, 0x05f3, 0x05f4, 0x060c,
              0x061b, 0x061f, 0x0640, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, 0x0650, 0x0651,
              0x0652, 0x066a, 0x0e3f, 0x200c, 0x200d, 0x200e, 0x200f, 0x2013, 0x2014, 0x2015,
              0x2017, 0x2018, 0x2019, 0x201a, 0x201c, 0x201d, 0x201e, 0x2020, 0x2021, 0x2022,
              0x2026, 0x2030, 0x2032, 0x2033, 0x2039, 0x203a, 0x2044, 0x20a7, 0x20aa, 0x20ab,
              0x20ac, 0x2116, 0x2118, 0x2122, 0x2126, 0x2135, 0x2190, 0x2191, 0x2192, 0x2193,
              0x2194, 0x2195, 0x21b5, 0x21d0, 0x21d1, 0x21d2, 0x21d3, 0x21d4, 0x2200, 0x2202,
              0x2203, 0x2205, 0x2206, 0x2207, 0x2208, 0x2209, 0x220b, 0x220f, 0x2211, 0x2212,
              0x2215, 0x2217, 0x2219, 0x221a, 0x221d, 0x221e, 0x2220, 0x2227, 0x2228, 0x2229,
              0x222a, 0x222b, 0x2234, 0x223c, 0x2245, 0x2248, 0x2260, 0x2261, 0x2264, 0x2265,
              0x2282, 0x2283, 0x2284, 0x2286, 0x2287, 0x2295, 0x2297, 0x22a5, 0x22c5, 0x2310,
              0x2320, 0x2321, 0x2329, 0x232a, 0x2469, 0x2500, 0x2502, 0x250c, 0x2510, 0x2514,
              0x2518, 0x251c, 0x2524, 0x252c, 0x2534, 0x253c, 0x2550, 0x2551, 0x2552, 0x2553,
              0x2554, 0x2555, 0x2556, 0x2557, 0x2558, 0x2559, 0x255a, 0x255b, 0x255c, 0x255d,
              0x255e, 0x255f, 0x2560, 0x2561, 0x2562, 0x2563, 0x2564, 0x2565, 0x2566, 0x2567,
              0x2568, 0x2569, 0x256a, 0x256b, 0x256c, 0x2580, 0x2584, 0x2588, 0x258c, 0x2590,
              0x2591, 0x2592, 0x2593, 0x25a0, 0x25b2, 0x25bc, 0x25c6, 0x25ca, 0x25cf, 0x25d7,
              0x2605, 0x260e, 0x261b, 0x261e, 0x2660, 0x2663, 0x2665, 0x2666, 0x2701, 0x2702,
              0x2703, 0x2704, 0x2706, 0x2707, 0x2708, 0x2709, 0x270c, 0x270d, 0x270e, 0x270f,
              0x2710, 0x2711, 0x2712, 0x2713, 0x2714, 0x2715, 0x2716, 0x2717, 0x2718, 0x2719,
              0x271a, 0x271b, 0x271c, 0x271d, 0x271e, 0x271f, 0x2720, 0x2721, 0x2722, 0x2723,
              0x2724, 0x2725, 0x2726, 0x2727, 0x2729, 0x272a, 0x272b, 0x272c, 0x272d, 0x272e,
              0x272f, 0x2730, 0x2731, 0x2732, 0x2733, 0x2734, 0x2735, 0x2736, 0x2737, 0x2738,
              0x2739, 0x273a, 0x273b, 0x273c, 0x273d, 0x273e, 0x273f, 0x2740, 0x2741, 0x2742,
              0x2743, 0x2744, 0x2745, 0x2746, 0x2747, 0x2748, 0x2749, 0x274a, 0x274b, 0x274d,
              0x274f, 0x2750, 0x2751, 0x2752, 0x2756, 0x2758, 0x2759, 0x275a, 0x275b, 0x275c,
              0x275d, 0x275e, 0x2761, 0x2762, 0x2763, 0x2764, 0x2765, 0x2766, 0x2767, 0x277f,
              0x2789, 0x2793, 0x2794, 0x2798, 0x2799, 0x279a, 0x279b, 0x279c, 0x279d, 0x279e,
              0x279f, 0x27a0, 0x27a1, 0x27a2, 0x27a3, 0x27a4, 0x27a5, 0x27a6, 0x27a7, 0x27a8,
              0x27a9, 0x27aa, 0x27ab, 0x27ac, 0x27ad, 0x27ae, 0x27af, 0x27b1, 0x27b2, 0x27b3,
              0x27b4, 0x27b5, 0x27b6, 0x27b7, 0x27b8, 0x27b9, 0x27ba, 0x27bb, 0x27bc, 0x27bd,
              0x27be, 0xf6d9, 0xf6da, 0xf6db, 0xf8d7, 0xf8d8, 0xf8d9, 0xf8da, 0xf8db, 0xf8dc,
              0xf8dd, 0xf8de, 0xf8df, 0xf8e0, 0xf8e1, 0xf8e2, 0xf8e3, 0xf8e4, 0xf8e5, 0xf8e6,
              0xf8e7, 0xf8e8, 0xf8e9, 0xf8ea, 0xf8eb, 0xf8ec, 0xf8ed, 0xf8ee, 0xf8ef, 0xf8f0,
              0xf8f1, 0xf8f2, 0xf8f3, 0xf8f4, 0xf8f5, 0xf8f6, 0xf8f7, 0xf8f8, 0xf8f9, 0xf8fa,
              0xf8fb, 0xf8fc, 0xf8fd, 0xf8fe, 0xfe7c, 0xfe7d,
          );

          // Convert the code points to a UTF-8 string and escape it so that
          // regex metacharacters and the '/' delimiter in the list are
          // treated literally.
          $pattern = preg_quote(utf8_from_unicode($UTF8_SPECIAL_CHARS), '/');

          // Wrap everything in one character class, prepend the control
          // character range, and enable UTF-8 mode with the 'u' modifier.
          $pattern = '/[\x00-\x19' . $pattern . ']/u';
      }

      return $pattern;
  }
  92.  
  93. //--------------------------------------------------------------------
  /**
   * Checks a string for whether it contains only word characters. This
   * is logically equivalent to the \w PCRE meta character. Note that
   * this is not a 100% guarantee that the string only contains alpha /
   * numeric characters but just that common non-alphanumeric are not
   * in the string, including ASCII device control characters.
   *
   * The check is a single preg_match() against the pattern returned by
   * utf8_specials_pattern(); any match means a special character is present.
   *
   * @package utf8
   * @subpackage utils
   * @param string $str string to check
   * @return boolean TRUE if the string only contains word characters
   * @see utf8_specials_pattern
   */
  function utf8_is_word_chars($str) {
      // preg_match() yields 1 when a special character is found; cast to
      // bool and negate so that "no special character found" becomes TRUE.
      return !(bool)preg_match(utf8_specials_pattern(), $str);
  }
  109.  
  110. //--------------------------------------------------------------------
  /**
   * Removes special characters (non-alphanumeric) from a UTF-8 string.
   *
   * This can be useful as a helper for sanitizing a string for use as
   * something like a file name or a unique identifier. Be warned though:
   * it does not handle all possible non-alphanumeric characters and is
   * not intended as some kind of security / injection filter.
   *
   * Every character matched by utf8_specials_pattern() is replaced with
   * $repl; with the default empty string the characters are simply removed.
   *
   * @package utf8
   * @subpackage utils
   * @author Andreas Gohr <andi@splitbrain.org>
   * @param string $string The UTF-8 string to strip of special chars
   * @param string $repl (optional) Replace each special char with this string
   * @return string with common non-alphanumeric characters removed
   * @see utf8_specials_pattern
   */
  function utf8_strip_specials($string, $repl = '') {
      // The result of preg_replace() is returned as-is. Per the PHP manual
      // it returns NULL when PCRE reports an error.
      return preg_replace(utf8_specials_pattern(), $repl, $string);
  }

Documentation generated on Thu, 08 Jan 2009 17:48:33 +0100 by phpDocumentor 1.4.0a2