alphanormalize_php  v.03.00.03 — June 17, 2020
alphanormalize.inc
Go to the documentation of this file.
1 <?php // -*- coding: utf-8 -*-
2 
3 /** \file alphanormalize.inc
4  * (June 17, 2020)
5  *
6  * \brief Main module: Simple functions to remove "accents" and replace non-alphanumeric characters.
7  *
8  * Piece of alphanormalize_php.
9  * https://bitbucket.org/OPiMedia/alphanormalize_php
10  *
11  * LGPL3 --- Copyright (C) 2013, 2016, 2020 Olivier Pirson
12  * http://www.opimedia.be/
13  *
14  * @version 03.00.03 --- June 17, 2020
15  * @author Olivier Pirson <olivier.pirson.opi@gmail.com>
16  * @package Alphanormalize
17  *
18  *
19  * \mainpage alphanormalize_php
20  * Simple functions to remove "accents" and replace non-alphanumeric characters.
21  * See alphanormalize.inc file.
22  *
23  * <ul>
24  * <li>Sources on Bitbucket: <a href="https://bitbucket.org/OPiMedia/alphanormalize_php" target="_blank"><tt>https://bitbucket.org/OPiMedia/alphanormalize_php</tt></a></li>
25  * <li><a href="http://www.opimedia.be/DS/webdev/PHP/alphanormalize-php/docs/" target="_blank">Online HTML documentation</a></li>
26  * <li><a href="http://www.opimedia.be/DS/webdev/PHP/alphanormalize-php/alphanormalize-test.php" target="_blank">Online simple test page</a></li>
27  * </ul>
28  *
29  * The main function is similar to alphanormalize function of the online JavaScript application
30  * <a href="http://www.opimedia.be/DS/online-tools/txt2/" target="_blank"><tt>http://www.opimedia.be/DS/online-tools/txt2/</tt></a>
31  *
32  * <img src="alphanormalize_php_64x64.png" width="64" height="64" alt="[alphanormalize_php]">
33  *
34  * <div>
35  * LGPL
36  * -----
37  * Copyright (C) 2013, 2016, 2020 Olivier Pirson
38  *
39  * This program is free software: you can redistribute it and/or modify
40  * it under the terms of the GNU Lesser General Public License as published by
41  * the Free Software Foundation, either version 3 of the License, or
42  * (at your option) any later version.
43  *
44  * This program is distributed in the hope that it will be useful,
45  * but WITHOUT ANY WARRANTY; without even the implied warranty of
46  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
47  * GNU Lesser General Public License for more details.
48  *
49  * You should have received a copy of the GNU Lesser General Public License
50  * along with this program. If not, see <http://www.gnu.org/licenses/>.
51  * </div>
52  *
53  * <div align="right">
54  * &copy; Olivier <span style="font-variant:small-caps">Pirson</span>\n
55  * <a class="net" href="http://www.opimedia.be/" target="_blank"><tt>www.opimedia.be</tt></a>\n
56  * <a class="mail" href="mailto:olivier.pirson.opi@gmail.com?subject=[alphanormalize_php]"><tt>olivier.pirson.opi@gmail.com</tt></a>
57  * </div>
58  */
59 
60 namespace Alphanormalize;
61 
62 
63 
/**
 * \brief Copy of $s without "accents".
 *
 * Returns a copy of $s in which each "accented" character is replaced by its
 * unaccented equivalent
 * (the converted characters are the keys of the associative table
 * $ACCENTALPHA_TO_ALPHA loaded from the accentalpha_to_alpha.inc file).
 * Every other character is copied unchanged.
 *
 * For example: <code>'Élément'</code> => <code>'Element'</code>.
 *
 * If $encoding === null
 * then the internal character encoding (mb_internal_encoding()) is used.
 *
 * @param string $s
 * @param null|string $encoding
 *
 * @return string
 */
function mb_str_accentalpha_to_alpha($s, $encoding=null) {
    #DEBUG
    // Real expressions instead of strings: a string argument is never evaluated by PHP 8.
    assert(is_string($s));
    assert(($encoding === null) || is_string($encoding));
    #DEBUG_END

    // Defines the local variable $ACCENTALPHA_TO_ALPHA.
    require 'accentalpha_to_alpha.inc';

    // Resolve the encoding once so that a single loop serves both cases.
    if ($encoding === null) {
        $encoding = mb_internal_encoding();
    }

    $a = array();  // array of converted characters
    $len = mb_strlen($s, $encoding);

    for ($i = 0; $i < $len; $i++) {
        $c = mb_substr($s, $i, 1, $encoding);

        // Mapped character -> its replacement, any other character -> itself.
        $a[] = (array_key_exists($c, $ACCENTALPHA_TO_ALPHA)
                ? $ACCENTALPHA_TO_ALPHA[$c]
                : $c);
    }

    return implode('', $a);
}
118 
119 
/**
 * \brief
 * Copy of $s without "accents",
 * with the characters of the Greek alphabet replaced
 * and every group of non-alphanumeric characters replaced by $replacement.
 *
 * Returns a copy of $s in which:
 * - each ASCII digit or letter (0-9, A-Z, a-z) is kept unchanged ;
 * - each "accented" character is converted by removing its "accent"
 *   (the converted characters are the keys of the associative table
 *   $ACCENTALPHA_TO_ALPHA loaded from the accentalpha_to_alpha.inc file) ;
 * - each character of the Greek alphabet is converted to alphabetic characters
 *   (the converted characters are the keys of the associative table
 *   $GREEK_TO_ALPHA loaded from the greek_to_alpha.inc file) ;
 * - each group of consecutive other characters is replaced by a single $replacement.
 *
 * For example: <code>'Élément ; α and ω.'</code> => <code>'Element_a_and_o_'</code>.
 *
 * Adopts the standard ONU/ELOT: see http://www.opimedia.be/DS/mementos/grecs.htm .
 *
 * If $strip
 * then HTML tags are removed first (strip_tags()).
 *
 * If $entity_decode
 * then HTML entities are converted to normal characters (after the optional strip)
 * by html_entity_decode(), using the same encoding as the rest of the conversion.
 * (Before PHP 5.4, not all HTML entities are supported!
 * html_entity_decode() only supports a limited set of character sets.)
 *
 * If $encoding === null
 * then the internal character encoding (mb_internal_encoding()) is used.
 *
 * @param string $s
 * @param bool $strip
 * @param bool $entity_decode
 * @param string $replacement
 * @param null|string $encoding
 *
 * @return string
 */
function mb_str_alphanormalize($s, $strip=false, $entity_decode=false, $replacement='_', $encoding=null) {
    #DEBUG
    // Real expressions instead of strings: a string argument is never evaluated by PHP 8.
    assert(is_string($s));
    assert(is_bool($strip));
    assert(is_bool($entity_decode));
    assert(is_string($replacement));
    assert(($encoding === null) || is_string($encoding));
    #DEBUG_END

    // Define the local variables $ACCENTALPHA_TO_ALPHA and $GREEK_TO_ALPHA.
    require 'accentalpha_to_alpha.inc';
    require 'greek_to_alpha.inc';

    // Resolve the encoding once: it is shared by the entity decoding and by the character loop.
    if ($encoding === null) {
        $encoding = mb_internal_encoding();
    }

    if ($strip) {  // delete HTML tags
        $s = strip_tags($s);
    }

    if ($entity_decode) {  // convert HTML entities to normal characters
        // Decode into the encoding used below to split $s,
        // not unconditionally into the internal encoding.
        $s = html_entity_decode($s, ENT_COMPAT, $encoding);
    }

    $not_consecutive = true;  // false while the last pushed item is a $replacement
    $a = array();             // array of converted characters
    $len = mb_strlen($s, $encoding);

    for ($i = 0; $i < $len; $i++) {
        $c = mb_substr($s, $i, 1, $encoding);

        if ((('0' <= $c) && ($c <= '9'))
            || (('A' <= $c) && ($c <= 'Z'))
            || (('a' <= $c) && ($c <= 'z'))) {  // alphanumeric character
            $a[] = $c;
            $not_consecutive = true;
        }
        elseif (array_key_exists($c, $ACCENTALPHA_TO_ALPHA)) {  // "accented" character -> 1 or 2 alphabetic characters
            $a[] = $ACCENTALPHA_TO_ALPHA[$c];
            $not_consecutive = true;
        }
        elseif (array_key_exists($c, $GREEK_TO_ALPHA)) {  // Greek letter -> 1 or 2 alphabetic characters
            $a[] = $GREEK_TO_ALPHA[$c];
            $not_consecutive = true;
        }
        elseif ($not_consecutive) {  // other character -> $replacement, only once per group
            $a[] = $replacement;
            $not_consecutive = false;
        }
    }

    return implode('', $a);
}
236 
237 
/**
 * \brief Copy of $s with the characters of the Greek alphabet replaced.
 *
 * Returns a copy of $s in which each character of the Greek alphabet is converted to alphabetic characters
 * (the converted characters are the keys of the associative table
 * $GREEK_TO_ALPHA loaded from the greek_to_alpha.inc file).
 * Every other character is copied unchanged.
 *
 * Adopts the standard ONU/ELOT: see http://www.opimedia.be/DS/mementos/grecs.htm .
 *
 * For example: <code>'α and ω'</code> => <code>'a and o'</code>.
 *
 * If $encoding === null
 * then the internal character encoding (mb_internal_encoding()) is used.
 *
 * @param string $s
 * @param null|string $encoding
 *
 * @return string
 */
function mb_str_greek_to_alpha($s, $encoding=null) {
    #DEBUG
    // Real expressions instead of strings: a string argument is never evaluated by PHP 8.
    assert(is_string($s));
    assert(($encoding === null) || is_string($encoding));
    #DEBUG_END

    // Defines the local variable $GREEK_TO_ALPHA.
    require 'greek_to_alpha.inc';

    // Resolve the encoding once so that a single loop serves both cases.
    if ($encoding === null) {
        $encoding = mb_internal_encoding();
    }

    $a = array();  // array of converted characters
    $len = mb_strlen($s, $encoding);

    for ($i = 0; $i < $len; $i++) {
        $c = mb_substr($s, $i, 1, $encoding);

        // Greek letter -> its transliteration, any other character -> itself.
        $a[] = (array_key_exists($c, $GREEK_TO_ALPHA)
                ? $GREEK_TO_ALPHA[$c]
                : $c);
    }

    return implode('', $a);
}
294 
295 
/**
 * \brief Version identifier of this module.
 *
 * @return string the version number followed by the release date
 */
function version() {
    $release = '03.00.03 --- June 17, 2020';

    return $release;
}
304 
305 
306 return true; // value received by the script that loads this file with include/require
307 
308 ?>
mb_str_accentalpha_to_alpha($s, $encoding=null)
Copy of $s without "accents".
$GREEK_TO_ALPHA
mb_str_greek_to_alpha($s, $encoding=null)
Copy of $s in which the characters of the Greek alphabet are replaced.
version()
Return the version of this module.
mb_str_alphanormalize($s, $strip=false, $entity_decode=false, $replacement='_', $encoding=null)
Copy of $s without "accents", with the characters of the Greek alphabet replaced and all non-alph...
$ACCENTALPHA_TO_ALPHA