Package pyarabic :: Module archars :: Variable arabicchars :: Class arabicchars
[hide private]
[frames | no frames]

Class arabicchars

source code

The arabicchars class contains all Arabic letters; it is a subclass of unicode.

Instance Methods [hide private]
 
__init__() source code
    is letter functions
 
isSukun(self, archar)
Checks for Arabic Sukun Mark.
source code
 
isShadda(self, archar)
Checks for Arabic Shadda Mark.
source code
 
isTatweel(self, archar)
Checks for Arabic Tatweel letter modifier.
source code
 
isTanwin(self, archar)
Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).
source code
 
isTashkeel(self, archar)
Checks for Arabic Tashkeel Marks (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN).
source code
 
isHaraka(self, archar)
Checks for Arabic Harakat Marks (FATHA,DAMMA,KASRA,SUKUN,TANWIN).
source code
 
isShortharaka(self, archar)
Checks for Arabic short Harakat Marks (FATHA,DAMMA,KASRA,SUKUN).
source code
 
isLigature(self, archar)
Checks for Arabic Ligatures like LamAlef.
source code
 
isHamza(self, archar)
Checks for Arabic Hamza forms.
source code
 
isAlef(self, archar)
Checks for Arabic Alef forms.
source code
 
isYehlike(self, archar)
Checks for Arabic Yeh forms.
source code
 
isWawlike(self, archar)
Checks for Arabic Waw like forms.
source code
 
isTeh(self, archar)
Checks for Arabic Teh forms.
source code
 
isSmall(self, archar)
Checks for Arabic Small letters.
source code
 
isWeak(self, archar)
Checks for Arabic Weak letters.
source code
 
isMoon(self, archar)
Checks for Arabic Moon letters.
source code
 
isSun(self, archar)
Checks for Arabic Sun letters.
source code
    general letter functions
integer;
order(self, archar)
return Arabic letter order between 1 and 29.
source code
unicode;
name(self, archar)
return Arabic letter name in arabic.
source code
unicode;
range(self)
Return a list of Arabic characters.
source code
    Has letter functions
 
hasShadda(self, word)
Checks if the arabic word contains shadda.
source code
    word and text functions
 
isVocalized(self, word)
Checks if the arabic word is vocalized.
source code
 
isVocalizedtext(self, text)
Checks if the arabic text is vocalized.
source code
Boolean
isArabicstring(self, text)
Checks for Arabic Unicode block characters.
source code
Boolean
isArabicword(self, word)
Checks for a valid Arabic word.
source code
    Strip functions
unicode.
stripHarakat(self, text)
Strip Harakat from arabic word except Shadda.
source code
unicode.
stripTashkeel(self, text)
Strip vowels from a text, include Shadda.
source code
unicode.
stripTatweel(self, text)
Strip tatweel from a text and return a result text.
source code
unicode.
normalizeLigature(self, text)
Normalize Lam Alef ligatures into two letters (LAM and ALEF), and return a result text.
source code
unicode.
vocalizedlike(self, word, vocalized)
Return True if the given word has the same or a partial vocalization like the pattern vocalized.
source code
Class Variables [hide private]
  COMMA = u'\u060C'
  SEMICOLON = u'\u061B'
  QUESTION = u'\u061F'
  HAMZA = u'\u0621'
  ALEF_MADDA = u'\u0622'
  ALEF_HAMZA_ABOVE = u'\u0623'
  WAW_HAMZA = u'\u0624'
  ALEF_HAMZA_BELOW = u'\u0625'
  YEH_HAMZA = u'\u0626'
  ALEF = u'\u0627'
  BEH = u'\u0628'
  TEH_MARBUTA = u'\u0629'
  TEH = u'\u062a'
  THEH = u'\u062b'
  JEEM = u'\u062c'
  HAH = u'\u062d'
  KHAH = u'\u062e'
  DAL = u'\u062f'
  THAL = u'\u0630'
  REH = u'\u0631'
  ZAIN = u'\u0632'
  SEEN = u'\u0633'
  SHEEN = u'\u0634'
  SAD = u'\u0635'
  DAD = u'\u0636'
  TAH = u'\u0637'
  ZAH = u'\u0638'
  AIN = u'\u0639'
  GHAIN = u'\u063a'
  TATWEEL = u'\u0640'
  FEH = u'\u0641'
  QAF = u'\u0642'
  KAF = u'\u0643'
  LAM = u'\u0644'
  MEEM = u'\u0645'
  NOON = u'\u0646'
  HEH = u'\u0647'
  WAW = u'\u0648'
  ALEF_MAKSURA = u'\u0649'
  YEH = u'\u064a'
  MADDA_ABOVE = u'\u0653'
  HAMZA_ABOVE = u'\u0654'
  HAMZA_BELOW = u'\u0655'
  ZERO = u'\u0660'
  ONE = u'\u0661'
  TWO = u'\u0662'
  THREE = u'\u0663'
  FOUR = u'\u0664'
  FIVE = u'\u0665'
  SIX = u'\u0666'
  SEVEN = u'\u0667'
  EIGHT = u'\u0668'
  NINE = u'\u0669'
  PERCENT = u'\u066a'
  DECIMAL = u'\u066b'
  THOUSANDS = u'\u066c'
  STAR = u'\u066d'
  MINI_ALEF = u'\u0670'
  ALEF_WASLA = u'\u0671'
  FULL_STOP = u'\u06d4'
  BYTE_ORDER_MARK = u'\ufeff'
  FATHATAN = u'\u064b'
  DAMMATAN = u'\u064c'
  KASRATAN = u'\u064d'
  FATHA = u'\u064e'
  DAMMA = u'\u064f'
  KASRA = u'\u0650'
  SHADDA = u'\u0651'
  SUKUN = u'\u0652'
  SMALL_ALEF = u"\u0670"
  SMALL_WAW = u"\u06E5"
  SMALL_YEH = u"\u06E6"
  LAM_ALEF = u'\ufefb'
  LAM_ALEF_HAMZA_ABOVE = u'\ufef7'
  LAM_ALEF_HAMZA_BELOW = u'\ufef9'
  LAM_ALEF_MADDA_ABOVE = u'\ufef5'
  simple_LAM_ALEF = u'\u0644\u0627'
  simple_LAM_ALEF_HAMZA_ABOVE = u'\u0644\u0623'
  simple_LAM_ALEF_HAMZA_BELOW = u'\u0644\u0625'
  simple_LAM_ALEF_MADDA_ABOVE = u'\u0644\u0622'
  LETTERS = u''.join([ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, H...
  TASHKEEL = FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, ...
  HARAKAT = FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, S...
  SHORTHARAKAT = FATHA, DAMMA, KASRA, SUKUN
  TANWIN = FATHATAN, DAMMATAN, KASRATAN
  LIGUATURES = LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BE...
  HAMZAT = HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW...
  ALEFAT = ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,...
  WEAK = ALEF, WAW, YEH, ALEF_MAKSURA
  YEHLIKE = YEH, YEH_HAMZA, ALEF_MAKSURA, SMALL_YEH
  WAWLIKE = WAW, WAW_HAMZA, SMALL_WAW
  TEHLIKE = TEH, TEH_MARBUTA
  SMALL = SMALL_ALEF, SMALL_WAW, SMALL_YEH
  MOON = HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ...
  SUN = TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, ...
  AlphabeticOrder = {ALEF: 1, BEH: 2, TEH: 3, TEH_MARBUTA: 3, TH...
  HARAKAT_pattern = re.compile(ur"["+ u"".join(TASHKEEL)+ u"]")
  HAMZAT_pattern = re.compile(ur"["+ u"".join(HAMZAT)+ u"]")
  ALEFAT_pattern = re.compile(ur"["+ u"".join(ALEFAT)+ u"]")
  LIGUATURES_pattern = re.compile(ur"["+ u"".join(LIGUATURES)+ u"]")
Method Details [hide private]

isSukun(self, archar)

source code 

Checks for Arabic Sukun Mark.

Parameters:
  • archar (unicode) - arabic unicode char

isShadda(self, archar)

source code 

Checks for Arabic Shadda Mark.

Parameters:
  • archar (unicode) - arabic unicode char

isTatweel(self, archar)

source code 

Checks for Arabic Tatweel letter modifier.

Parameters:
  • archar (unicode) - arabic unicode char

isTanwin(self, archar)

source code 

Checks for Arabic Tanwin Marks (FATHATAN, DAMMATAN, KASRATAN).

Parameters:
  • archar (unicode) - arabic unicode char

isTashkeel(self, archar)

source code 

Checks for Arabic Tashkeel Marks (FATHA, DAMMA, KASRA, SUKUN, SHADDA, FATHATAN, DAMMATAN, KASRATAN).

Parameters:
  • archar (unicode) - arabic unicode char

isHaraka(self, archar)

source code 

Checks for Arabic Harakat Marks (FATHA,DAMMA,KASRA,SUKUN,TANWIN).

Parameters:
  • archar (unicode) - arabic unicode char

isShortharaka(self, archar)

source code 

Checks for Arabic short Harakat Marks (FATHA,DAMMA,KASRA,SUKUN).

Parameters:
  • archar (unicode) - arabic unicode char

isLigature(self, archar)

source code 

Checks for Arabic Ligatures like LamAlef. (LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE)

Parameters:
  • archar (unicode) - arabic unicode char

isHamza(self, archar)

source code 

Checks for Arabic Hamza forms. HAMZAT are (HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW,ALEF_HAMZA_BELOW, ALEF_HAMZA_ABOVE )

Parameters:
  • archar (unicode) - arabic unicode char

isAlef(self, archar)

source code 

Checks for Arabic Alef forms. ALEFAT=(ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW,ALEF_WASLA, ALEF_MAKSURA );

Parameters:
  • archar (unicode) - arabic unicode char

isYehlike(self, archar)

source code 

Checks for Arabic Yeh forms. Yeh forms : YEH, YEH_HAMZA, SMALL_YEH, ALEF_MAKSURA

Parameters:
  • archar (unicode) - arabic unicode char

isWawlike(self, archar)

source code 

Checks for Arabic Waw like forms. Waw forms : WAW, WAW_HAMZA, SMALL_WAW

Parameters:
  • archar (unicode) - arabic unicode char

isTeh(self, archar)

source code 

Checks for Arabic Teh forms. Teh forms : TEH, TEH_MARBUTA

Parameters:
  • archar (unicode) - arabic unicode char

isSmall(self, archar)

source code 

Checks for Arabic Small letters. SMALL Letters : SMALL ALEF, SMALL WAW, SMALL YEH

Parameters:
  • archar (unicode) - arabic unicode char

isWeak(self, archar)

source code 

Checks for Arabic Weak letters. Weak Letters : ALEF, WAW, YEH, ALEF_MAKSURA

Parameters:
  • archar (unicode) - arabic unicode char

isMoon(self, archar)

source code 

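Checks for Arabic Moon letters. Moon Letters : HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF, BEH, JEEM, HAH, KHAH, AIN, GHAIN, FEH, QAF, KAF, MEEM, HEH, WAW, YEH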

Parameters:
  • archar (unicode) - arabic unicode char

isSun(self, archar)

source code 

Checks for Arabic Sun letters. Sun Letters : TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, LAM, NOON

Parameters:
  • archar (unicode) - arabic unicode char

order(self, archar)

source code 

Return the Arabic letter order, between 1 and 29. Alef order is 1, Yeh is 28, Hamza is 29. Teh Marbuta has the same order as Teh, 3.

Parameters:
  • archar (unicode) - arabic unicode char
Returns: integer;
arabic order.

name(self, archar)

source code 

Return the Arabic letter name in Arabic. Alef order is 1, Yeh is 28, Hamza is 29. Teh Marbuta has the same order as Teh, 3.

Parameters:
  • archar (unicode) - arabic unicode char
Returns: unicode;
arabic name.

range(self)

source code 

Return a list of Arabic characters. Returns the list of characters from \u060c to \u0652.

Returns: unicode;
list of Arabic characters.

hasShadda(self, word)

source code 

Checks if the arabic word contains shadda.

Parameters:
  • word (unicode) - arabic unicode word

isVocalized(self, word)

source code 

Checks if the Arabic word is vocalized. The word mustn't contain any spaces or punctuation.

Parameters:
  • word (unicode) - arabic unicode word

isVocalizedtext(self, text)

source code 

Checks if the arabic text is vocalized. The text can contain many words and spaces

Parameters:
  • text (unicode) - arabic unicode text

isArabicstring(self, text)

source code 

Checks for Arabic Unicode block characters.

Parameters:
  • text (unicode) - input text
Returns: Boolean
True if all characters are in the Arabic block

isArabicword(self, word)

source code 

Checks for a valid Arabic word. An Arabic word

Parameters:
  • word (unicode) - input word
Returns: Boolean
True if all characters are in the Arabic block

stripHarakat(self, text)

source code 

Strip Harakat from an Arabic word, except Shadda. The stripped marks are:

  • FATHA, DAMMA, KASRA
  • SUKUN
  • FATHATAN, DAMMATAN, KASRATAN.

Example:

>>> text=u"الْعَرَبِيّةُ"
>>> stripHarakat(text)
العربيّة
Parameters:
  • text (unicode.) - arabic text.
Returns: unicode.
return a stripped text.

stripTashkeel(self, text)

source code 

Strip vowels from a text, including Shadda. The stripped marks are:

  • FATHA, DAMMA, KASRA
  • SUKUN
  • SHADDA
  • FATHATAN, DAMMATAN, KASRATAN.

Example:

>>> text=u"الْعَرَبِيّةُ"
>>> stripTashkeel(text)
العربية
Parameters:
  • text (unicode.) - arabic text.
Returns: unicode.
return a stripped text.

stripTatweel(self, text)

source code 

Strip tatweel from a text and return a result text.

Example:

>>> text=u"العـــــربية"
>>> stripTatweel(text)
العربية
Parameters:
  • text (unicode.) - arabic text.
Returns: unicode.
return a stripped text.

normalizeLigature(self, text)

source code 

Normalize Lam Alef ligatures into two letters (LAM and ALEF), and return a result text. Some systems present the LamAlef ligature as a single letter; this function converts it into two letters. The ligatures converted into LAM and ALEF are:

  • LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_ABOVE

Example:

>>> text=u"لانها لالء الاسلام"
>>> normalizeLigature(text)
لانها لالئ الاسلام
Parameters:
  • text (unicode.) - arabic text.
Returns: unicode.
return a converted text.

vocalizedlike(self, word, vocalized)

source code 

Return True if the given word has the same or a partial vocalization like the pattern vocalized.

Parameters:
  • word (unicode.) - arabic word, full/partial vocalized.
  • vocalized (unicode.) - arabic full vocalized word.
Returns: unicode.
True if vocalized.

Class Variable Details [hide private]

LETTERS

Value:
u''.join([ALEF, BEH, TEH, TEH_MARBUTA, THEH, JEEM, HAH, KHAH, DAL, THA\
L, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, AIN, GHAIN, FEH, QAF, K\
AF, LAM, MEEM, NOON, HEH, WAW, YEH, HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOV\
E, WAW_HAMZA, ALEF_HAMZA_BELOW, YEH_HAMZA,])

TASHKEEL

Value:
FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN, SHADDA

HARAKAT

Value:
FATHATAN, DAMMATAN, KASRATAN, FATHA, DAMMA, KASRA, SUKUN

LIGUATURES

Value:
LAM_ALEF, LAM_ALEF_HAMZA_ABOVE, LAM_ALEF_HAMZA_BELOW, LAM_ALEF_MADDA_A\
BOVE,

HAMZAT

Value:
HAMZA, WAW_HAMZA, YEH_HAMZA, HAMZA_ABOVE, HAMZA_BELOW, ALEF_HAMZA_BELO\
W, ALEF_HAMZA_ABOVE,

ALEFAT

Value:
ALEF, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF_WASLA, ALEF\
_MAKSURA, SMALL_ALEF,

MOON

Value:
HAMZA, ALEF_MADDA, ALEF_HAMZA_ABOVE, ALEF_HAMZA_BELOW, ALEF, BEH, JEEM\
, HAH, KHAH, AIN, GHAIN, FEH, QAF, KAF, MEEM, HEH, WAW, YEH,

SUN

Value:
TEH, THEH, DAL, THAL, REH, ZAIN, SEEN, SHEEN, SAD, DAD, TAH, ZAH, LAM,\
 NOON,

AlphabeticOrder

Value:
{ALEF: 1, BEH: 2, TEH: 3, TEH_MARBUTA: 3, THEH: 4, JEEM: 5, HAH: 6, KH\
AH: 7, DAL: 8, THAL: 9, REH: 10, ZAIN: 11, SEEN: 12, SHEEN: 13, SAD: 1\
4, DAD: 15, TAH: 16, ZAH: 17, AIN: 18, GHAIN: 19, FEH: 20, QAF: 21, KA\
F: 22, LAM: 23, MEEM: 24, NOON: 25, HEH: 26, WAW: 27, YEH: 28, HAMZA: \
29, ALEF_MADDA: 29, ALEF_HAMZA_ABOVE: 29, WAW_HAMZA: 29, ALEF_HAMZA_BE\
LOW: 29, YEH_HAMZA: 29,}