1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """
24 Arabic module
25 """
26 import re
28 """
29 the arabic chars contains all arabic letters, a sub class of unicode,
30 """
31
# ---------------------------------------------------------------------------
# Single-character constants.
# Every value is a unicode literal so the module behaves the same on
# Python 2 and Python 3.
# ---------------------------------------------------------------------------

# Punctuation, letters, combining hamza/madda signs, digits and symbols.
COMMA = u'\u060C'
SEMICOLON = u'\u061B'
QUESTION = u'\u061F'
HAMZA = u'\u0621'
ALEF_MADDA = u'\u0622'
ALEF_HAMZA_ABOVE = u'\u0623'
WAW_HAMZA = u'\u0624'
ALEF_HAMZA_BELOW = u'\u0625'
YEH_HAMZA = u'\u0626'
ALEF = u'\u0627'
BEH = u'\u0628'
TEH_MARBUTA = u'\u0629'
TEH = u'\u062a'
THEH = u'\u062b'
JEEM = u'\u062c'
HAH = u'\u062d'
KHAH = u'\u062e'
DAL = u'\u062f'
THAL = u'\u0630'
REH = u'\u0631'
ZAIN = u'\u0632'
SEEN = u'\u0633'
SHEEN = u'\u0634'
SAD = u'\u0635'
DAD = u'\u0636'
TAH = u'\u0637'
ZAH = u'\u0638'
AIN = u'\u0639'
GHAIN = u'\u063a'
TATWEEL = u'\u0640'
FEH = u'\u0641'
QAF = u'\u0642'
KAF = u'\u0643'
LAM = u'\u0644'
MEEM = u'\u0645'
NOON = u'\u0646'
HEH = u'\u0647'
WAW = u'\u0648'
ALEF_MAKSURA = u'\u0649'
YEH = u'\u064a'
MADDA_ABOVE = u'\u0653'
HAMZA_ABOVE = u'\u0654'
HAMZA_BELOW = u'\u0655'
ZERO = u'\u0660'
ONE = u'\u0661'
TWO = u'\u0662'
THREE = u'\u0663'
FOUR = u'\u0664'
FIVE = u'\u0665'
SIX = u'\u0666'
SEVEN = u'\u0667'
EIGHT = u'\u0668'
NINE = u'\u0669'
PERCENT = u'\u066a'
DECIMAL = u'\u066b'
THOUSANDS = u'\u066c'
STAR = u'\u066d'
MINI_ALEF = u'\u0670'
ALEF_WASLA = u'\u0671'
FULL_STOP = u'\u06d4'
BYTE_ORDER_MARK = u'\ufeff'

# Diacritics (tashkeel).
FATHATAN = u'\u064b'
DAMMATAN = u'\u064c'
KASRATAN = u'\u064d'
FATHA = u'\u064e'
DAMMA = u'\u064f'
KASRA = u'\u0650'
SHADDA = u'\u0651'
SUKUN = u'\u0652'

# Small letters.  SMALL_ALEF is the same code point as MINI_ALEF.
SMALL_ALEF = u"\u0670"
SMALL_WAW = u"\u06E5"
SMALL_YEH = u"\u06E6"

# Lam-Alef ligatures (single code points) ...
LAM_ALEF = u'\ufefb'
LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
LAM_ALEF_HAMZA_BELOW = u'\ufef9'
LAM_ALEF_MADDA_ABOVE = u'\ufef5'
# ... and the equivalent two-letter sequences (LAM followed by an ALEF form).
simple_LAM_ALEF = u'\u0644\u0627'
simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'

# ---------------------------------------------------------------------------
# Character groups (tuples, meant for ``char in GROUP`` membership tests).
# ---------------------------------------------------------------------------

# Nunation marks.
TANWIN = (FATHATAN, DAMMATAN, KASRATAN)
# Short vowels plus sukun.
SHORTHARAKAT = (FATHA, DAMMA, KASRA, SUKUN)
# All harakat: tanwin + short harakat; SHADDA is deliberately excluded.
# NOTE: HARAKAT must stay a tuple.  It used to be rebound to a compiled
# regex further down, which made ``char in HARAKAT`` raise TypeError; use
# HARAKAT_pat when a regex is needed.
HARAKAT = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
)
# Harakat plus SHADDA.
TASHKEEL = (
    FATHATAN, DAMMATAN, KASRATAN,
    FATHA, DAMMA, KASRA,
    SUKUN,
    SHADDA,
)

LIGUATURES = (
    LAM_ALEF,
    LAM_ALEF_HAMZA_ABOVE,
    LAM_ALEF_HAMZA_BELOW,
    LAM_ALEF_MADDA_ABOVE,
)
HAMZAT = (
    HAMZA,
    WAW_HAMZA,
    YEH_HAMZA,
    HAMZA_ABOVE,
    HAMZA_BELOW,
    ALEF_HAMZA_BELOW,
    ALEF_HAMZA_ABOVE,
)
ALEFAT = (
    ALEF,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF_WASLA,
    ALEF_MAKSURA,
    SMALL_ALEF,
)
WEAK = (ALEF, WAW, YEH, ALEF_MAKSURA)
YEHLIKE = (
    YEH,
    YEH_HAMZA,
    ALEF_MAKSURA,
    SMALL_YEH,
)
WAWLIKE = (
    WAW,
    WAW_HAMZA,
    SMALL_WAW,
)
TEHLIKE = (
    TEH,
    TEH_MARBUTA,
)
# Small letters.  Previously a copy of TEHLIKE; the file's own documentation
# of the small-letter check lists SMALL ALEF, SMALL WAW and SMALL YEH.
SMALL = (
    SMALL_ALEF,
    SMALL_WAW,
    SMALL_YEH,
)
# Moon and sun letters.  Both groups used to be copies of TEHLIKE; they now
# hold the standard classification from Arabic grammar (the LAM of the
# definite article assimilates before a sun letter and not before a moon
# letter).  The two groups are disjoint.
MOON = (
    HAMZA,
    ALEF_MADDA,
    ALEF_HAMZA_ABOVE,
    ALEF_HAMZA_BELOW,
    ALEF,
    BEH,
    JEEM,
    HAH,
    KHAH,
    AIN,
    GHAIN,
    FEH,
    QAF,
    KAF,
    MEEM,
    HEH,
    WAW,
    YEH,
)
SUN = (
    TEH,
    THEH,
    DAL,
    THAL,
    REH,
    ZAIN,
    SEEN,
    SHEEN,
    SAD,
    DAD,
    TAH,
    ZAH,
    LAM,
    NOON,
)

# ---------------------------------------------------------------------------
# Compiled patterns.
# Each pattern is a bare character class built from the tuple of the same
# name.  They are intentionally NOT anchored with ^...$: the helpers in this
# module run re.search / re.sub over whole words, and an anchored
# single-character pattern can never match inside a word.  Matching a single
# character with ``.match(char)`` gives the same answer as before.
# Plain u"" literals are used because the ur"" prefix is a syntax error on
# Python 3 (the patterns contain no backslashes, so raw strings add nothing).
# ---------------------------------------------------------------------------
HARAKAT_pat = re.compile(u"[%s]" % u"".join(HARAKAT))
# Same character class as HARAKAT_pat (HARAKAT never contained SHADDA).
HARAKAT_NO_SHADDA_pat = re.compile(u"[%s]" % u"".join(HARAKAT))
TASHKEEL_pat = re.compile(u"[%s]" % u"".join(TASHKEEL))
200
204 """ Checks for an Arabic Unicode block characters;
205 @param text: input text
206 @type text: unicode
207 @return: True if all charaters are in Arabic block
208 @rtype: Boolean
209 """
210 pass;
211 if len(word)==0: return False;
212 word_nm=ar_strip_marks_keepshadda(word);
213
214 word_nm=word_nm.replace(ALEF_MADDA,HAMZA+ALEF);
215 if word[0] in (WAW_HAMZA,YEH_HAMZA,FATHA,DAMMA,SUKUN,KASRA):
216 return False;
217
218 if re.match(u"^(.)*[%s](.)+$"%ALEF_MAKSURA,word):
219 return False;
220 if re.match(u"^(.)*[%s]([^%s%s%s])(.)+$"%(TEH_MARBUTA,DAMMA,KASRA,FATHA),word):
221 return False;
222
223
224 if re.search(u"([^\u0621-\u0652%s%s%s])"%(LAM_ALEF, LAM_ALEF_HAMZA_ABOVE,LAM_ALEF_MADDA_ABOVE),word):
225 return False;
226 if re.match(u"([\d])+",word):
227 return False;
228 return True;
229
231 """ Checks for an valid Arabic word.
232 An Arabic word
233 @param word: input word
234 @type word: unicode
235 @return: True if all charaters are in Arabic block
236 @rtype: Boolean
237 """
238 pass;
239
240
241
243 """Checks for Arabic Sukun Mark.
244 @param archar: arabic unicode char
245 @type archar: unicode
246 """
247 if archar==self.SUKUN:
248 return True;
249 else: return False;
250
252 """Checks for Arabic Shadda Mark.
253 @param archar: arabic unicode char
254 @type archar: unicode
255 """
256 if archar==self.SHADDA:
257 return True;
258 else: return False;
259
261 """Checks for Arabic Tatweel letter modifier.
262 @param archar: arabic unicode char
263 @type archar: unicode
264 """
265 if archar==self.TATWEEL:
266 return True;
267 else: return False;
269 """Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
270 @param archar: arabic unicode char
271 @type archar: unicode
272 """
273 if archar in self.TANWIN:
274 return True;
275 else: return False;
276
278 """Checks for Arabic Tashkeel Marks (FATHA,DAMMA,KASRA, SUKUN, SHADDA, FATHATAN,DAMMATAN, KASRATAn).
279 @param archar: arabic unicode char
280 @type archar: unicode
281 """
282
283 if archar in self.TASHKEEL:
284 return True;
285 else: return False;
286
288 """Checks for Arabic Harakat Marks (FATHA,DAMMA,KASRA,SUKUN,TANWIN).
289 @param archar: arabic unicode char
290 @type archar: unicode
291 """
292 if archar in self.HARAKAT:
293 return True;
294 else: return False;
295
297 """Checks for Arabic short Harakat Marks (FATHA,DAMMA,KASRA,SUKUN).
298 @param archar: arabic unicode char
299 @type archar: unicode
300 """
301 if archar in self.SHORTHARAKAT:
302 return True;
303 else: return False;
304
306 """Checks for Arabic Ligatures like LamAlef.
307 (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)
308 @param archar: arabic unicode char
309 @type archar: unicode
310 """
311 if archar in self.LIGUATURES:
312 return True;
313 else: return False;
314
316 """Checks for Arabic Hamza forms.
317 HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE )
318 @param archar: arabic unicode char
319 @type archar: unicode
320 """
321 if archar in self.HAMZAT:
322 return True;
323 else: return False;
324
326 """Checks for Arabic Alef forms.
327 ALEFAT=(ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,ALEF_WASLA, ALEF_MAKSURA );
328 @param archar: arabic unicode char
329 @type archar: unicode
330 """
331 if archar in self.ALEFAT:
332 return True;
333 else: return False;
334
336 """Checks for Arabic Yeh forms.
337 Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA
338 @param archar: arabic unicode char
339 @type archar: unicode
340 """
341 if archar in self.YEHLIKE:
342 return True;
343 else: return False;
344
346 """Checks for Arabic Waw like forms.
347 Waw forms : WAW, WAW_HAMZA, SMALL_WAW
348 @param archar: arabic unicode char
349 @type archar: unicode
350 """
351 if archar in self.WAWLIKE:
352 return True;
353 else: return False;
354
356 """Checks for Arabic Teh forms.
357 Teh forms : TEH, TEH_MARBUTA
358 @param archar: arabic unicode char
359 @type archar: unicode
360 """
361 if archar in self.TEHLIKE:
362 return True;
363 else: return False;
365 """Checks for Arabic Small letters.
366 SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH
367 @param archar: arabic unicode char
368 @type archar: unicode
369 """
370 if archar in self.SMALL:
371 return True;
372 else: return False;
373
375 """Checks for Arabic Weak letters.
376 Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA
377 @param archar: arabic unicode char
378 @type archar: unicode
379 """
380 if archar in self.WEAK:
381 return True;
382 else: return False;
383
384
385
386
387
389 """Checks if the arabic word contains shadda.
390 @param word: arabic unicode char
391 @type word: unicode
392 """
393 if re.search(self.SHADDA,word):
394 return True;
395 else:
396 return False;
397
399 """Checks if the arabic word contains harakat ( FATHA, DAMMA, KASRA,.
400 @param word: arabic unicode char
401 @type word: unicode
402 """
403 if re.search(self.HARAKAT_pat,word):
404 return True;
405 else:
406 return False;
407
408
409
411 """Checks if the arabic word is vocalized.
412 the word musn't have any spaces and pounctuations.
413 @param word: arabic unicode char
414 @type word: unicode
415 """
416 if word.isalpha(): return False;
417
418 else:
419 if re.search(self.HARAKAT_pat,word):
420 return True;
421 else:
422 return False;
def isVocalizedtext(self,text):
    """Check whether an Arabic text is vocalized.

    The text can contain many words and spaces; it is reported as
    vocalized as soon as one of its characters matches HARAKAT_pat.
    @param text: arabic unicode text
    @type text: unicode
    @return: True if at least one character of text is a haraka
    @rtype: Boolean
    """
    # The previous body searched an undefined name (``word``) and so raised
    # NameError on every call.  The check is done character by character so
    # that it gives the right answer whether HARAKAT_pat is anchored to a
    # single character (``^[...]$``) or is a bare character class.
    harakat_pattern = self.HARAKAT_pat
    return any(re.match(harakat_pattern, char) for char in text)
433
434
435
437 """Strip Harakat from arabic word except Shadda.
438 Harakat doesn't contain Shdda.
439 to strip all Harakat and Shadda, use stripTashkeel function.
440 @param word: arabic unicode char
441 @type word: unicode
442 """
443 return re.sub(self.HARAKAT_pat,u'',word)
444
446 """Strip Tashkeel from arabic word.
447 Tashkeel contains (Harakat and Shadda)
448 to strip all Harakat and Shadda, use stripTashkeel function.
449 @param word: arabic unicode char
450 @type word: unicode
451 """
452 return re.sub(self.TASHKEEL_pat,'',word);
453
455 """Strip Tatweel (Kashida) from arabic word.
456 @param word: arabic unicode char
457 @type word: unicode
458 """
459 return re.sub(self.TATWEEL,'',word);
460