]> git.wh0rd.org - tt-rss.git/blob - lib/languagedetect/Text/LanguageDetect/ISO639.php
add text_languagedetect to guess article language for better hyphenation
[tt-rss.git] / lib / languagedetect / Text / LanguageDetect / ISO639.php
1 <?php
2 /**
3 * Part of Text_LanguageDetect
4 *
5 * PHP version 5
6 *
7 * @category Text
8 * @package Text_LanguageDetect
9 * @author Christian Weiske <cweiske@php.net>
10 * @copyright 2011 Christian Weiske <cweiske@php.net>
11 * @license http://www.debian.org/misc/bsd.license BSD
12 * @version SVN: $Id$
13 * @link http://pear.php.net/package/Text_LanguageDetect/
14 */
15
16 /**
17 * Provides a mapping between the languages from lang.dat and the
18 * ISO 639-1 and ISO-639-2 codes.
19 *
20 * Note that this class contains only languages that exist in lang.dat.
21 *
22 * @category Text
23 * @package Text_LanguageDetect
24 * @author Christian Weiske <cweiske@php.net>
25 * @copyright 2011 Christian Weiske <cweiske@php.net>
26 * @license http://www.debian.org/misc/bsd.license BSD
27 * @link http://www.loc.gov/standards/iso639-2/php/code_list.php
28 */
class Text_LanguageDetect_ISO639
{
    /**
     * Maps all language names from the language database to the
     * ISO 639-1 2-letter language code.
     *
     * NULL indicates that there is no 2-letter code.
     *
     * @var array
     */
    public static $nameToCode2 = array(
        'albanian'   => 'sq',
        'arabic'     => 'ar',
        'azeri'      => 'az',
        'bengali'    => 'bn',
        'bulgarian'  => 'bg',
        'cebuano'    => null,
        'croatian'   => 'hr',
        'czech'      => 'cs',
        'danish'     => 'da',
        'dutch'      => 'nl',
        'english'    => 'en',
        'estonian'   => 'et',
        'farsi'      => 'fa',
        'finnish'    => 'fi',
        'french'     => 'fr',
        'german'     => 'de',
        'hausa'      => 'ha',
        'hawaiian'   => null,
        'hindi'      => 'hi',
        'hungarian'  => 'hu',
        'icelandic'  => 'is',
        'indonesian' => 'id',
        'italian'    => 'it',
        'kazakh'     => 'kk',
        'kyrgyz'     => 'ky',
        'latin'      => 'la',
        'latvian'    => 'lv',
        'lithuanian' => 'lt',
        'macedonian' => 'mk',
        'mongolian'  => 'mn',
        'nepali'     => 'ne',
        'norwegian'  => 'no',
        'pashto'     => 'ps',
        'pidgin'     => null,
        'polish'     => 'pl',
        'portuguese' => 'pt',
        'romanian'   => 'ro',
        'russian'    => 'ru',
        'serbian'    => 'sr',
        'slovak'     => 'sk',
        'slovene'    => 'sl',
        'somali'     => 'so',
        'spanish'    => 'es',
        'swahili'    => 'sw',
        'swedish'    => 'sv',
        'tagalog'    => 'tl',
        'turkish'    => 'tr',
        'ukrainian'  => 'uk',
        'urdu'       => 'ur',
        'uzbek'      => 'uz',
        'vietnamese' => 'vi',
        'welsh'      => 'cy',
    );

    /**
     * Maps all language names from the language database to the
     * ISO 639-2 3-letter language code.
     *
     * @var array
     */
    public static $nameToCode3 = array(
        'albanian'   => 'sqi',
        'arabic'     => 'ara',
        'azeri'      => 'aze',
        'bengali'    => 'ben',
        'bulgarian'  => 'bul',
        'cebuano'    => 'ceb',
        'croatian'   => 'hrv',
        'czech'      => 'ces',
        'danish'     => 'dan',
        'dutch'      => 'nld',
        'english'    => 'eng',
        'estonian'   => 'est',
        'farsi'      => 'fas',
        'finnish'    => 'fin',
        'french'     => 'fra',
        'german'     => 'deu',
        'hausa'      => 'hau',
        'hawaiian'   => 'haw',
        'hindi'      => 'hin',
        'hungarian'  => 'hun',
        'icelandic'  => 'isl',
        'indonesian' => 'ind',
        'italian'    => 'ita',
        'kazakh'     => 'kaz',
        'kyrgyz'     => 'kir',
        'latin'      => 'lat',
        'latvian'    => 'lav',
        'lithuanian' => 'lit',
        'macedonian' => 'mkd',
        'mongolian'  => 'mon',
        'nepali'     => 'nep',
        'norwegian'  => 'nor',
        'pashto'     => 'pus',
        'pidgin'     => 'crp',
        'polish'     => 'pol',
        'portuguese' => 'por',
        'romanian'   => 'ron',
        'russian'    => 'rus',
        'serbian'    => 'srp',
        'slovak'     => 'slk',
        'slovene'    => 'slv',
        'somali'     => 'som',
        'spanish'    => 'spa',
        'swahili'    => 'swa',
        'swedish'    => 'swe',
        'tagalog'    => 'tgl',
        'turkish'    => 'tur',
        'ukrainian'  => 'ukr',
        'urdu'       => 'urd',
        'uzbek'      => 'uzb',
        'vietnamese' => 'vie',
        'welsh'      => 'cym',
    );

    /**
     * Maps ISO 639-1 2-letter language codes to the language names
     * in the language database
     *
     * Not all languages have a 2 letter code, so some are missing
     *
     * @var array
     */
    public static $code2ToName = array(
        'ar' => 'arabic',
        'az' => 'azeri',
        'bg' => 'bulgarian',
        'bn' => 'bengali',
        'cs' => 'czech',
        'cy' => 'welsh',
        'da' => 'danish',
        'de' => 'german',
        'en' => 'english',
        'es' => 'spanish',
        'et' => 'estonian',
        'fa' => 'farsi',
        'fi' => 'finnish',
        'fr' => 'french',
        'ha' => 'hausa',
        'hi' => 'hindi',
        'hr' => 'croatian',
        'hu' => 'hungarian',
        'id' => 'indonesian',
        'is' => 'icelandic',
        'it' => 'italian',
        'kk' => 'kazakh',
        'ky' => 'kyrgyz',
        'la' => 'latin',
        'lt' => 'lithuanian',
        'lv' => 'latvian',
        'mk' => 'macedonian',
        'mn' => 'mongolian',
        'ne' => 'nepali',
        'nl' => 'dutch',
        'no' => 'norwegian',
        'pl' => 'polish',
        'ps' => 'pashto',
        'pt' => 'portuguese',
        'ro' => 'romanian',
        'ru' => 'russian',
        'sk' => 'slovak',
        'sl' => 'slovene',
        'so' => 'somali',
        'sq' => 'albanian',
        'sr' => 'serbian',
        'sv' => 'swedish',
        'sw' => 'swahili',
        'tl' => 'tagalog',
        'tr' => 'turkish',
        'uk' => 'ukrainian',
        'ur' => 'urdu',
        'uz' => 'uzbek',
        'vi' => 'vietnamese',
    );

    /**
     * Maps ISO 639-2 3-letter language codes to the language names
     * in the language database.
     *
     * Every key here is the value that $nameToCode3 holds for the same
     * language, so a name -> code -> name round trip is lossless.
     *
     * @var array
     */
    public static $code3ToName = array(
        'ara' => 'arabic',
        'aze' => 'azeri',
        'ben' => 'bengali',
        'bul' => 'bulgarian',
        'ceb' => 'cebuano',
        'ces' => 'czech',
        'crp' => 'pidgin',
        'cym' => 'welsh',
        'dan' => 'danish',
        'deu' => 'german',
        'eng' => 'english',
        'est' => 'estonian',
        'fas' => 'farsi',
        'fin' => 'finnish',
        'fra' => 'french',
        'hau' => 'hausa',
        'haw' => 'hawaiian',
        'hin' => 'hindi',
        'hrv' => 'croatian',
        'hun' => 'hungarian',
        'ind' => 'indonesian',
        'isl' => 'icelandic',
        'ita' => 'italian',
        'kaz' => 'kazakh',
        'kir' => 'kyrgyz',
        'lat' => 'latin',
        'lav' => 'latvian',
        'lit' => 'lithuanian',
        'mkd' => 'macedonian',
        'mon' => 'mongolian',
        'nep' => 'nepali',
        'nld' => 'dutch',
        'nor' => 'norwegian',
        'pol' => 'polish',
        'por' => 'portuguese',
        'pus' => 'pashto',
        // "ron" (not "rom", which is Romany in ISO 639-2) so that this
        // entry matches the code $nameToCode3 emits for "romanian".
        'ron' => 'romanian',
        'rus' => 'russian',
        'slk' => 'slovak',
        'slv' => 'slovene',
        'som' => 'somali',
        'spa' => 'spanish',
        'sqi' => 'albanian',
        'srp' => 'serbian',
        'swa' => 'swahili',
        'swe' => 'swedish',
        'tgl' => 'tagalog',
        'tur' => 'turkish',
        'ukr' => 'ukrainian',
        'urd' => 'urdu',
        'uzb' => 'uzbek',
        'vie' => 'vietnamese',
    );

    /**
     * Returns the 2-letter ISO 639-1 code for the given language name.
     *
     * The lookup is case-insensitive. Languages that are known but have
     * no 2-letter code (stored as NULL in $nameToCode2) yield NULL, the
     * same as unknown names.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Two-letter language code (e.g. "sv") or NULL
     *                     if not found
     */
    public static function nameToCode2($lang)
    {
        $lang = strtolower($lang);
        // isset() is false for both missing keys and NULL values; both
        // cases are reported to the caller as NULL.
        if (!isset(self::$nameToCode2[$lang])) {
            return null;
        }
        return self::$nameToCode2[$lang];
    }

    /**
     * Returns the 3-letter ISO 639-2 code for the given language name.
     *
     * The lookup is case-insensitive.
     *
     * @param string $lang English language name like "swedish"
     *
     * @return string|null Three-letter language code (e.g. "swe") or NULL
     *                     if not found
     */
    public static function nameToCode3($lang)
    {
        $lang = strtolower($lang);
        if (!isset(self::$nameToCode3[$lang])) {
            return null;
        }
        return self::$nameToCode3[$lang];
    }

    /**
     * Returns the language name for the given 2-letter ISO 639-1 code.
     *
     * The lookup is case-insensitive, matching nameToCode2().
     *
     * @param string $code Two-letter language code (e.g. "sv")
     *
     * @return string|null English language name like "swedish" or NULL
     *                     if the code is not known
     */
    public static function code2ToName($code)
    {
        // Normalise the code itself; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code2ToName[$code])) {
            return null;
        }
        return self::$code2ToName[$code];
    }

    /**
     * Returns the language name for the given 3-letter ISO 639-2 code.
     *
     * The lookup is case-insensitive, matching nameToCode3().
     *
     * @param string $code Three-letter language code (e.g. "swe")
     *
     * @return string|null English language name like "swedish" or NULL
     *                     if the code is not known
     */
    public static function code3ToName($code)
    {
        // Normalise the code itself; the table keys are all lower case.
        $code = strtolower($code);
        if (!isset(self::$code3ToName[$code])) {
            return null;
        }
        return self::$code3ToName[$code];
    }
}
340
341 ?>