Module RTFDE.text_extraction

Expand source code
# -*- coding: utf-8 -*-
#
# This file is part of RTFDE, a RTF De-Encapsulator.
# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.

import codecs
import re
from collections import namedtuple
from typing import Union, Any, List, Tuple, Dict

from oletools.common import codepages

from lark.lexer import Token
from lark.tree import Tree

from RTFDE.exceptions import MalformedRtf
from RTFDE.utils import is_codeword_with_numeric_arg
from RTFDE.utils import flatten_tree_to_string_array
from RTFDE.utils import log_text_extraction, is_logger_on

import logging
log = logging.getLogger("RTFDE")

# Parsed font-table entry: the font number control word (e.g. b'\\f0'), the
# resolved codepage number (or None), the matching python codec name (or
# None), and the raw font definition group flattened back to bytes.
fontdef = namedtuple("fontdef", ["fnum", "codepage", "codec", "fontdef_tree"])


def get_font_table(tree: Tree) -> Tree:
    """Extract the font table group from the first 20 tokens of a .rtf document.

Args:
    tree (Tree): A .rtf document object parsed into a Tree object

Raises:
    ValueError: If no group with a `\\fonttbl` token as its first controlword is found.

Returns:
    The Tree object for the group whose second child is the `\\fonttbl` control word.
    """
    for candidate in tree.children[:20]:
        if not isinstance(candidate, Tree):
            continue
        # The \fonttbl control word sits in the second slot of its group.
        if len(candidate.children) < 2:
            continue
        second = candidate.children[1]
        if isinstance(second, Token) and second.value.strip() == b"\\fonttbl":
            return candidate
    raise ValueError("No font table found in tree")


def is_font_number(token: Token) -> bool:
    """Checks if an object is a "font number".

Returns:
    True if an object is a "font number" controlword `\\fN`. False if not.

"""
    try:
        return bool(is_codeword_with_numeric_arg(token, b'\\f'))
    except AttributeError: # pragma: no cover
        return False

def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]:
    """Return the codepage to use with a specific fcharsetN.

Args:
    fcharsetN (int): The numeric argument N for a \\fcharsetN control word.

Returns:
    (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)

    """
    # Charset table retrieved on 2022-08-19
    # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN
    # Maps fcharsetN value directly to its codepage; None marks charsets with
    # no corresponding codepage (DEFAULT, SYMBOL, OEM).
    fcharset_to_codepage: dict[int, Any] = {
        0: 1252,    # ANSI_CHARSET (0x00)
        1: None,    # DEFAULT_CHARSET (0x01)
        2: None,    # SYMBOL_CHARSET (0x02)
        128: 932,   # SHIFTJIS_CHARSET (0x80)
        129: 949,   # HANGUL_CHARSET (0x81)
        134: 936,   # GB2312_CHARSET (0x86)
        136: 950,   # CHINESEBIG5_CHARSET (0x88)
        161: 1253,  # GREEK_CHARSET (0xA1)
        162: 1254,  # TURKISH_CHARSET (0xA2)
        177: 1255,  # HEBREW_CHARSET (0xB1)
        178: 1256,  # ARABIC_CHARSET (0xB2)
        186: 1257,  # BALTIC_CHARSET (0xBA)
        204: 1251,  # RUSSIAN_CHARSET (0xCC)
        222: 874,   # THAI_CHARSET (0xDE)
        238: 1250,  # EE_CHARSET (0xEE)
        255: None,  # OEM_CHARSET (0xFF)
    }
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Getting charset for {fcharsetN}")
    return fcharset_to_codepage.get(fcharsetN, None)


def get_default_font(tree: Tree) -> Union[str,None]:
    """Extract the font number controlword default font if it exists.

If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group.

Args:
    tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.

Returns:
    The default font control number if it exists from the first `\\deffN`. None if not found.
"""
    matches = tree.scan_values(
        lambda v: is_codeword_with_numeric_arg(v, b'\\deff')
    )
    # We just want the first \deffN. It shouldn't be set multiple times.
    for deff in matches:
        # Strip the b'\deff' prefix, keeping the numeric argument N.
        return b'\\f' + deff.value[5:]
    return None

def parse_font_tree(font_tree: Tree) -> dict:
    """Create a font tree dictionary with appropriate codecs to decode text.

Args:
    font_tree (Tree): The .rtf font table object decoded as a tree.

Returns:
    A dictionary which maps font numbers to the fontdef namedtuples carrying the python codecs needed to decode text.
"""
    font_map = {}
    for definition in font_tree.children:
        if not isinstance(definition, Tree):
            continue
        fnum = None
        fcharset_cp = None
        cpg_num = None
        for tok in definition.children:
            if is_codeword_with_numeric_arg(tok, b'\\f'):
                fnum = tok.value
            elif is_codeword_with_numeric_arg(tok, b'\\fcharset'):
                fcharset_cp = get_codepage_num_from_fcharset(int(tok.value[9:]))
            elif is_codeword_with_numeric_arg(tok, b'\\cpg'):
                cpg_num = int(tok.value[4:])
        # Only record fonts which have a font number defined.
        if fnum is None:
            continue
        # Resolve the codepage, preferring \fcharset over \cpg:
        # if both \fcharset and \cpg appear in the font table, \cpg is ignored.
        codepage_num = None
        if fcharset_cp is not None:
            try:
                codepage_num = check_codepage_num(fcharset_cp)
            except ValueError: # pragma: no cover
                codepage_num = None
        if (codepage_num is None) and (cpg_num is not None):
            try:
                codepage_num = check_codepage_num(cpg_num)
            except ValueError: # pragma: no cover
                codepage_num = None
        # Look up the python codec matching the resolved codepage (if any).
        if codepage_num is not None:
            codec = get_python_codec(codepage_num)
        else:
            codec = None
        # Keep the raw font definition around, flattened to a bytes string.
        tree_str = b"".join(flatten_tree_to_string_array(definition))
        font_map[fnum] = fontdef(fnum, codepage_num, codec, tree_str)
    return font_map


def get_python_codec(codepage_num: int) -> str:
    """Returns the python codec needed to decode bytes to unicode.

Args:
    codepage_num (int): A codepage number.

Returns:
    The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
"""
    text_codec = codepages.codepage2codec(codepage_num)
    log.debug(f'Found python codec corresponding to code page {codepage_num}: {text_codec}')
    return text_codec

def check_codepage_num(codepage_num: int) -> int:
    """Provide the codepage number back to you if it is valid.

Args:
    codepage_num (int): A possible codepage number.

Returns:
    The codepage number IF it is a valid codepage number

Raises:
    ValueError: The codepage_num provided isn't a valid codepage number.

"""
    # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it.
    # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
    # Retrieved on 2020-12-18
    allowed_codepage_nums = frozenset([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001])
    # Note: If support for a specific codepage ever becomes an issue we can look at add support using the actual code-pages.
    # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
    if codepage_num not in allowed_codepage_nums:
        raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
    return codepage_num


def validate_ansi_cpg(header: str) -> None:
    """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage.

Args:
    header (str): The value from the lark `\\ansicpg` CONTROLWORD Token.

Raises:
    MalformedRtf: If the value passed is not a valid ansi codepage.
"""
    try:
        # Strip the 8-char '\ansicpg' prefix; int() raises ValueError on junk.
        check_codepage_num(int(header.strip()[8:]))
    except ValueError as _e:
        raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e


# UNICODE CHARS
def unicode_escape_to_chr(item: bytes) -> str:
    """Convert unicode char from it's decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.

Args:
    item (bytes): A RTF Escape in the format \\u[-]NNNNN.

Returns:
    The unicode character representation of the identified character

Raises:
    ValueError: The escaped unicode character is not valid.
"""
    payload = item[len(b'\\u'):] if item.startswith(b'\\u') else item
    try:
        decimal = int(payload) # raises ValueError if not int.
    except ValueError as _e:
        raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e
    # § NNNNN is a positive integer expressed in decimal digits;
    # § -NNNNN is a negative integer expressed in decimal digits (16-bit signed),
    #   so shift it back into the unsigned code point range.
    ncr = decimal + 65536 if decimal < 0 else decimal
    return chr(ncr)

def is_hex_encoded(item: Token) -> bool:
    """Identify if a token contains a HEXENCODED token.
Args:
    item (token): A token to check if it is HEXENCODED.

Return:
    True if HEXENCODED. False if not.
    """
    return isinstance(item, Token) and item.type == "HEXENCODED"

def is_valid_ANSI_representation_char(item: Token) -> bool:
    """Is token contain a valid ANSI representation string for a Unicode char.

Args:
    item (token): A token to check if it is a valid ANSI representation.

Return:
    True if token is an ansi representation of a unicode char. False if not.
"""
    if not isinstance(item, Token):
        return False
    if is_hex_encoded(item):
        return True
    # A STRING token counts, but whitespace-only strings do not.
    return (item.type == 'STRING') and (not item.value.isspace())

def is_unicode_encoded(item: Token) -> bool:
    """Is token contain a unicode char.

Args:
    item (token): A token to check if contains a unicode char.

Return:
    True if token contains a unicode char. False if not.
"""
    return isinstance(item, Token) and item.type == "UNICODE"

def includes_unicode_chars(children: List[Token]) -> bool:
    """Does a list include Tokens which contain unicode characters. Not recursive.

Args:
    children (list): A Tree.children list to check to see if it includes unicode characters.

Returns:
    True if list includes tokens which contain unicode chars. False if not.
"""
    return any(is_unicode_encoded(child) for child in children)


def remove_unicode_replacements(children: List[Token],
                                return_ascii_map: bool = True,
                                byte_count: int = 1) -> Union[
                                    Tuple[List[Token], Dict[Token,List[Token]]],
                                    List[Token]]:
    """Remove all unicode replacement characters from a list of Tokens.

Args:
    children (list): A Tree.children list to remove unicode replacement characters from.
    return_ascii_map (bool): On True, have this function return a map of the ASCII token that were removed.
    byte_count (int): The number of bytes corresponding to a given \\uN Unicode character.  A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.

Returns:
    new_children (list): The list of Tokens with all unicode replacement characters removed.
    ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode Token they were replacements for.

"""
    # BUGFIX: this function previously hard-coded `byte_count = 1` here, which
    # silently discarded the caller-provided value and made the parameter dead.
    # Only fall back to the RTF spec default of 1 when no count was supplied.
    if byte_count is None:
        byte_count = 1
    ascii_map: Dict[Token,List[Token]]  = {}
    new_children = []
    # Stack of unicode Tokens whose ANSI replacement chars still need removing.
    # Each unicode Token is pushed once per replacement byte (see byte_count).
    removal_map: List[Token] = []
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Removing unicode replacements on {repr(children)}")
    for child in children:
        if len(removal_map) > 0:
            if isinstance(child, Token):
                # Delete all spaces between a unicode char and the last ANSI representation
                if child.value.isspace():
                    ascii_map.setdefault(removal_map[0], []).append(child)
                    continue
            if is_valid_ANSI_representation_char(child):
                # Found an ansi representation; remove one unicode char from the removal map.
                ascii_map.setdefault(removal_map.pop(), []).append(child)
                continue
            elif isinstance(child, Tree) and (
                    (child.data == "string") or (child.data == "hexarray")):
                # Replacement chars may be nested inside string/hexarray trees.
                ansi_children = child.children
                new_ansi_children = []
                for ac in ansi_children:
                    if is_valid_ANSI_representation_char(ac) and len(removal_map) > 0:
                        # Consume this child as a replacement for a pending unicode char.
                        ascii_map.setdefault(removal_map.pop(), []).append(ac)
                    else:
                        new_ansi_children.append(ac)
                if new_ansi_children == []:
                    # The whole tree was replacements; swap in an empty STRING token.
                    from RTFDE.utils import make_token_replacement
                    child = make_token_replacement("STRING", b"", child)
                else:
                    child.children = new_ansi_children
        # Modify char byte count if we encounter a \ucN keyword.
        if is_unicode_char_byte_count(child):
            byte_count = get_unicode_char_byte_count(child)
        if is_unicode_encoded(child):
            for _ in range(byte_count):
                # Add the unicode key to the removal map once per byte.
                # This ensures we remove the right number of ANSI representation chars.
                removal_map.append(child)
        new_children.append(child)
    if return_ascii_map is True:
        return new_children, ascii_map
    return new_children


# UNICODE SURROGATE CHARACTERS
def is_surrogate_high_char(item: bytes) -> bool:
    """Check's if chr is a is in the high-surrogate code point rage. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." High-surrogate also sometimes known as the leading surrogate.

        item (bytes): A bytes representation of a string representing a unicode character. "\\u-10179"
    """
    payload = item[2:] if item.startswith(b"\\u") else item
    # First interpret the value as a 16-bit signed integer shifted into range.
    if 0xD800 <= ord(chr(65536 + int(payload))) <= 0xDBFF:
        return True
    # In case unicode is NOT using the 16 bit signed integer
    return 0xD800 <= int(payload) <= 0xDBFF

def is_surrogate_low_char(item: bytes) -> bool:
    """Check's if chr is a is in the low-surrogate code point rage. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF."  Low-surrogate also sometimes known as following surrogates.

        item (bytes): A bytes representation of a string representing a unicode character.
    """
    payload = item[2:] if item.startswith(b"\\u") else item
    # First interpret the value as a 16-bit signed integer shifted into range.
    if 0xDC00 <= ord(chr(65536 + int(payload))) <= 0xDFFF:
        return True
    # In case unicode is NOT using the 16 bit signed integer
    return 0xDC00 <= int(payload) <= 0xDFFF

def is_surrogate_16bit(item: bytes, cp_range) -> bool:
    """Checks if a unicode char is 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions.

Args:
    item (bytes): A bytes representation of a string representing a unicode character.
    cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
    """
    bounds = {'high': (0xD800, 0xDBFF), 'low': (0xDC00, 0xDFFF)}
    if cp_range not in bounds:
        raise ValueError("cp_range must be either 'low' or 'high'")
    lower, upper = bounds[cp_range]
    # True only when the 16-bit signed interpretation falls in the range.
    return lower <= ord(chr(65536 + int(item))) <= upper


def is_surrogate_pair(first: bytes, second: bytes) -> bool:
    """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.

Args:
    first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
    second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
    """
    if not is_surrogate_high_char(first):
        return False
    if is_surrogate_low_char(second):
        return True
    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second))
    return False

def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes:
    """ Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.

Args:
        high (bytes): the high-surrogate code point
        low (bytes): the low-surrogate code point
        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because:  Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
    """
    # Equation for turning surrogate pairs into a unicode scalar value which be used with utl-16 can ONLY found in Unicode 3.0.0 standard.
    # Unicode scalar value means the same thing as "code position" or "code point"
    # https://www.unicode.org/versions/Unicode3.0.0/
    # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9
    if high.startswith(b"\\u"):
        high = high[2:]
    if low.startswith(b"\\u"):
        low = low[2:]

    def _surrogate_chr(raw: bytes, cp_range: str) -> str:
        # 16-bit signed values must be shifted into the surrogate range first.
        if is_surrogate_16bit(raw, cp_range):
            return chr(65536 + int(raw))
        return chr(int(raw))

    char_high = _surrogate_chr(high, "high")
    char_low = _surrogate_chr(low, "low")
    unicode_scalar_value = ((ord(char_high) - 0xD800) * 0x400) + (ord(char_low) - 0xDC00) + 0x10000
    unicode_bytes = chr(unicode_scalar_value).encode(encoding)
    return unicode_bytes.decode(encoding).encode()

def merge_surrogate_chars(children,
                          ascii_map,
                          use_ASCII_alternatives_on_unicode_decode_failure = False):
    """Merge adjacent high/low surrogate unicode Tokens into single STRING tokens, in place.

Args:
    children (list): A Tree.children list to merge surrogate pairs within.
    ascii_map (dict): Removed ASCII replacement Tokens keyed by the unicode Token they replaced (see remove_unicode_replacements). Used as a fallback on decode failure.
    use_ASCII_alternatives_on_unicode_decode_failure (bool): On True, substitute the ASCII alternatives instead of raising when a surrogate can't be decoded.

Raises:
    ValueError:  A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
    """
    surrogate_start = None  # index of the pending high surrogate in `children`
    surrogate_high = None   # the pending high-surrogate Token itself
    for i,c in enumerate(children):
        if isinstance(c, Tree):
            continue
        if not is_unicode_encoded(c):
            continue
        if is_surrogate_high_char(c.value):
            surrogate_start = i
            surrogate_high = c
        elif surrogate_start is not None:
            if is_surrogate_low_char(c.value):
                surrogate_low = c
                try:
                    surrogate_value = decode_surrogate_pair(surrogate_high.value,
                                                            surrogate_low.value)
                    # Convert into STRING token replacing the high surrogate.
                    surrogate_tok = Token('STRING',
                                          surrogate_value,
                                          start_pos=surrogate_high.start_pos,
                                          end_pos=surrogate_low.end_pos,
                                          line=surrogate_high.line,
                                          end_line=surrogate_low.end_line,
                                          column=surrogate_high.column,
                                          end_column=surrogate_low.end_column)
                    children[surrogate_start] = surrogate_tok
                    # Blank out the low surrogate slot so list positions stay stable.
                    blank_tok = Token('STRING',
                                      b"",
                                      start_pos=surrogate_high.start_pos+1,
                                      end_pos=surrogate_low.end_pos+1,
                                      line=surrogate_high.line,
                                      end_line=surrogate_low.end_line,
                                      column=surrogate_high.column,
                                      end_column=surrogate_low.end_column)
                    children[i] = blank_tok
                    surrogate_start = None
                    surrogate_high = None
                except UnicodeDecodeError as _e:
                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
                        children[surrogate_start] = b"".join([tok.value for tok in ascii_map[surrogate_high]])
                        children[i] = b"".join([tok.value for tok in ascii_map[surrogate_low]])
                        # BUGFIX: reset pairing state so later unicode chars are
                        # not matched against this already-consumed high surrogate.
                        surrogate_start = None
                        surrogate_high = None
                    else:
                        raise _e
            else:
                # BUGFIX: this log call previously referenced `surrogate_low`,
                # which is unbound on this path (UnboundLocalError on first
                # occurrence); log the offending token `c` instead.
                log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, c))
                if use_ASCII_alternatives_on_unicode_decode_failure is True:
                    children[surrogate_start] = b"".join([tok.value for tok in ascii_map[surrogate_high]])
                    # BUGFIX: reset pairing state after substituting the fallback.
                    surrogate_start = None
                    surrogate_high = None
                else:
                    raise ValueError("Standalone high-surrogate found. High surrogate followed by a illegal low-surrogate character.")
    return children



def is_unicode_char_byte_count(item: Token) -> bool:
    """Check whether a token is a `\\ucN` (unicode byte count) CONTROLWORD."""
    return (isinstance(item, Token)
            and item.type == "CONTROLWORD"
            and item.value.startswith(b'\\uc'))

def get_unicode_char_byte_count(item: Token) -> int:
    item = item.value.decode()
    cur_uc = int(item[3:])
    return cur_uc


# Hex Encoded Chars
def has_hexarray(children: List[Union[Token, Tree]]) -> bool:
    """Checks if an tree's children includes a hexarray tree.

    children (array): the children object from a tree.
    """
    return any(is_hexarray(item) for item in children)

def is_hexarray(item):
    """Checks if an item is a hexarray tree.

    item (Tree or Token): an item to check to see if its a hex array
    """
    return isinstance(item, Tree) and (item.data.value == 'hexarray')

def get_bytes_from_hex_encoded(item):
    """Convert hex encoded string to bytes.

    item (bytes): a hex encoded string in format \\'XX
    """
    # Drop the \' escape markers, leaving only the hex digits.
    hex_digits = item.replace(b"\\'", b"").decode()
    return bytes.fromhex(hex_digits)

def decode_hex_char(item, codec):
    """Decode a bytes object using a specified codec.

    item (bytes): A bytes object.
    codec (str): The name of the codec to use to decode the bytes
    """
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("decoding char {0} with font {1}".format(item, codec))
    if codec is None:
        codec = 'CP1252'  # Default to U.S. Windows default codepage
    decoded = item.decode(codec).encode()
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec))
    return decoded


class TextDecoder:
    """Decodes encoded text within a parsed RTF Tree in place: font-codepage hex
    bytes, \\uN unicode escapes, and surrogate pairs become STRING tokens."""

    def __init__(self, keep_fontdef=False,
               initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False):
        """
        keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed.
        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the  information.
        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.

        """
        self.keep_fontdef = keep_fontdef
        # Unicode Character Byte Count; stays None unless provided by the caller.
        self.ucbc = initial_byte_count
        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure

        # Font table values set by set_font_info
        self.default_font = None
        self.font_stack = []
        self.font_table = {}


    def set_font_info(self, obj: Tree):
        """Populate the default font, font stack, and parsed font table from the tree.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        self.default_font = get_default_font(obj)
        # The font stack starts with the document's default font on the bottom.
        self.font_stack = [self.default_font]
        # The font table is searched for within obj.children[1].
        raw_fonttbl = get_font_table(obj.children[1])
        self.font_table = parse_font_tree(raw_fonttbl)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")


    def update_children(self, obj: Tree):
        """Decode all encoded text below obj, replacing obj.children in place.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        # Reset font info
        self.set_font_info(obj)
        children = obj.children
        obj.children = [i for i in self.iterate_on_children(children)]

    def prep_unicode(self, children: List[Token]):
        # Returns children with unicode replacement chars stripped and
        # surrogate pairs merged; a no-op when no unicode tokens are present.
        if includes_unicode_chars(children):
            # Clean out all replacement chars
            children, ascii_map = remove_unicode_replacements(children,
                                                              byte_count=self.ucbc)
            # Merge all surrogate pairs
            children = merge_surrogate_chars(children,
                                             ascii_map,
                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
        return children

    def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
        # Generator: yields decoded replacements for each child, recursing into
        # nested Trees. Fonts set within this group are popped off the stack on exit.
        set_fonts = []
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("Starting to iterate on text extraction children...")
            log_text_extraction("PREP-BEFORE: "+repr(children))
        children = self.prep_unicode(children)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("PREP-AFTER: "+repr(children))

        for item in children:
            if is_font_number(item): # Font Definitions
                self.font_stack.append(item.value.strip())
                set_fonts.append(item.value)
                # Fontdef tokens are dropped from output unless keep_fontdef is set.
                if self.keep_fontdef is True:
                    yield item
            elif is_unicode_char_byte_count(item):
                # NOTE(review): `bc` is never read afterwards; presumably this was
                # meant to update self.ucbc for prep_unicode — confirm intent.
                bc = get_unicode_char_byte_count(item)
            elif is_unicode_encoded(item): # Unicode Chars
                decoded = unicode_escape_to_chr(item.value).encode()
                # Convert into STRING token
                decoded_tok = Token('STRING',
                                    decoded,
                                    start_pos=item.start_pos,
                                    end_pos=item.end_pos,
                                    line=item.line,
                                    end_line=item.end_line,
                                    column=item.column,
                                    end_column=item.end_column)
                if is_logger_on("RTFDE.text_extraction") is True:
                    log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
                yield decoded_tok
            # Decode a hex array
            elif is_hexarray(item):
                # Concatenate all hex bytes in the array before decoding, since
                # multi-byte codepages can span \'XX escapes.
                base_bytes = None
                for hexchild in item.children:
                    if base_bytes is None:
                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
                    else:
                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
                # Decode using the codec of the most recently set font.
                current_fontdef = self.font_table[self.font_stack[-1]]
                current_codec = current_fontdef.codec
                decoded_hex = decode_hex_char(base_bytes, current_codec)
                # We are replacing a Tree. So, need item.data to access it's info token
                decoded_hex_tok = Token('STRING',
                                        decoded_hex,
                                        start_pos=item.data.start_pos,
                                        end_pos=item.data.end_pos,
                                        line=item.data.line,
                                        end_line=item.data.end_line,
                                        column=item.data.column,
                                        end_column=item.data.end_column)
                yield decoded_hex_tok
            elif isinstance(item, Tree):
                # Run this same function recursively on nested trees
                item.children = [i for i in self.iterate_on_children(item.children)]
                yield item
            else:
                yield item
        for i in set_fonts:
            # Remove all fonts defined while in this group
            self.font_stack.pop()

Functions

def check_codepage_num(codepage_num: int) ‑> int

Provide the codepage number back to you if it is valid.

Args

codepage_num : int
A possible codepage number.

Returns

The codepage number IF it is a valid codepage number

Raises

ValueError
The codepage_num provided isn't a valid codepage number.
Expand source code
# Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
# Retrieved on 2020-12-18
# Hoisted to module level so the set is built once at import time instead of
# on every call to check_codepage_num().
_ALLOWED_CODEPAGE_NUMS = frozenset([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001])

def check_codepage_num(codepage_num: int) -> int:
    """Provide the codepage number back to you if it is valid.

Args:
    codepage_num (int): A possible codepage number.

Returns:
    The codepage number IF it is a valid codepage number

Raises:
    ValueError: The codepage_num provided isn't a valid codepage number.

"""
    # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it.
    if codepage_num in _ALLOWED_CODEPAGE_NUMS:
        return codepage_num
    # Note: If support for a specific codepage ever becomes an issue we can look at adding support using the actual code-pages.
    # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
    raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
def decode_hex_char(item, codec)

Decode a bytes object using a specified codec.

item (bytes): A bytes object. codec (str): The name of the codec to use to decode the bytes

Expand source code
def decode_hex_char(item, codec):
    """Decode a bytes object with the given codec, returning UTF-8 bytes.

    item (bytes): A bytes object.
    codec (str): The name of the codec to use to decode the bytes
    """
    # Log before applying the default so the original (possibly None) codec
    # is what appears in the trace.
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("decoding char {0} with font {1}".format(item, codec))
    if codec is None:
        # No font-specific codec known: fall back to the U.S. Windows default codepage.
        codec = 'CP1252'
    # Decode to text, then re-encode as UTF-8 bytes for downstream joining.
    result = item.decode(codec).encode()
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, result, codec))
    return result
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str = 'utf-16-le') ‑> bytes

Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.

Args

high : bytes
the high-surrogate code point
low : bytes
the low-surrogate code point
encoding : str
The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
Expand source code
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes:
    """Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.

Args:
        high (bytes): the high-surrogate code point
        low (bytes): the low-surrogate code point
        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
    """
    # The equation for turning surrogate pairs into a unicode scalar value
    # ("code position" / "code point") can ONLY be found in the Unicode 3.0.0
    # standard, section 3.7:
    # https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9
    def _code_point(raw: bytes, cp_range: str) -> int:
        # Strip the RTF \u escape prefix, then normalise RTF's 16-bit signed
        # representation back into an unsigned code point when needed.
        if raw.startswith(b"\\u"):
            raw = raw[2:]
        if is_surrogate_16bit(raw, cp_range):
            return ord(chr(65536 + int(raw)))
        return ord(chr(int(raw)))

    high_cp = _code_point(high, "high")
    low_cp = _code_point(low, "low")
    unicode_scalar_value = ((high_cp - 0xD800) * 0x400) + (low_cp - 0xDC00) + 0x10000
    # Round-trip through the target encoding, returning UTF-8 bytes.
    unicode_bytes = chr(unicode_scalar_value).encode(encoding)
    return unicode_bytes.decode(encoding).encode()
def get_bytes_from_hex_encoded(item)

Convert hex encoded string to bytes.

item (str): a hex encoded string in the format \'XX

Expand source code
def get_bytes_from_hex_encoded(item):
    """Convert hex encoded string to bytes.

    item (str): a hex encoded string in format \\'XX
    """
    # Drop the RTF \' escape markers, leaving only the raw hex digits.
    stripped = item.replace(b"\\'", b"")
    return bytes.fromhex(stripped.decode())
def get_codepage_num_from_fcharset(fcharsetN: int) ‑> Optional[int]

Return the codepage to use with a specific fcharsetN.

Args

fcharsetN : int
The numeric argument N for a charsetN control word.

Returns

(int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)

Expand source code
# Charset table retrieved on 2022-08-19
# https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN
# Maps a \fcharsetN argument to its Windows codepage identifier. A value of
# None marks charsets with no corresponding codepage (DEFAULT, SYMBOL, OEM).
# Hoisted to module level so the table is built once, not on every call.
_FCHARSET_TO_CODEPAGE: Dict[int, Union[int, None]] = {
    0: 1252,    # ANSI_CHARSET (0x00)
    1: None,    # DEFAULT_CHARSET (0x01)
    2: None,    # SYMBOL_CHARSET (0x02)
    128: 932,   # SHIFTJIS_CHARSET (0x80)
    129: 949,   # HANGUL_CHARSET (0x81)
    134: 936,   # GB2312_CHARSET (0x86)
    136: 950,   # CHINESEBIG5_CHARSET (0x88)
    161: 1253,  # GREEK_CHARSET (0xA1)
    162: 1254,  # TURKISH_CHARSET (0xA2)
    177: 1255,  # HEBREW_CHARSET (0xB1)
    178: 1256,  # ARABIC_CHARSET (0xB2)
    186: 1257,  # BALTIC_CHARSET (0xBA)
    204: 1251,  # RUSSIAN_CHARSET (0xCC)
    222: 874,   # THAI_CHARSET (0xDE)
    238: 1250,  # EE_CHARSET (0xEE)
    255: None,  # OEM_CHARSET (0xFF)
}

def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]:
    """Return the codepage to use with a specific fcharsetN.

Args:
    fcharsetN (int): The numeric argument N for a \\fcharsetN control word.

Returns:
    (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)

    """
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Getting charset for {fcharsetN}")
    # Unknown charsets and charsets without a codepage both yield None.
    return _FCHARSET_TO_CODEPAGE.get(fcharsetN, None)
def get_default_font(tree: lark.tree.Tree) ‑> Optional[str]

Extract the font number controlword default font if it exists.

If an RTF file uses a default font, the default font number is specified with the \deffN control word, which must precede the font-table group.

Args

tree : Tree
A lark Tree object. Should be the DeEncapsulator.full_tree object.

Returns

The default font control number if it exists from the first \deffN. None if not found.

Expand source code
def get_default_font(tree: Tree) -> Union[str,None]:
    """Extract the font number controlword default font if it exists.

If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group.

Args:
    tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.

Returns:
    The default font control number if it exists from the first `\\deffN`. None if not found.
"""
    matches = list(tree.scan_values(
        lambda v: is_codeword_with_numeric_arg(v, b'\\deff')
    ))
    if not matches:
        return None
    # Only the first \deffN matters; it shouldn't be set multiple times.
    # Strip the b'\deff' prefix (5 bytes) and rebuild it as a font number (\fN).
    return b'\\f' + matches[0].value[5:]
def get_font_table(tree: lark.tree.Tree) ‑> lark.tree.Tree

Extract the font table group from the first 20 tokens of a .rtf document.

Args

tree : Tree
A .rtf document object parsed into a Tree object

Raises

ValueError
If no group with a \fonttbl token as its first controlword is found.

Returns

{'\f0': fontdef(fnum='\f0', codepage=932, codec='cp932', fontdef_tree='{\f0\fswiss\fcharset128 MS PGothic;}'), '\f1': fontdef(fnum='\f1', codepage=None, codec=None, fontdef_tree='{\f1\fmodern MS Gothic;}'), '\f2': fontdef(fnum='\f2', codepage=None, codec=None, fontdef_tree='{\f2\fnil\fcharset2 Symbol;}'), '\f3': fontdef(fnum='\f3', codepage=1252, codec='cp1252', fontdef_tree='{\f3\fmodern\fcharset0 Courier New;}'), '\f4': fontdef(fnum='\f4', codepage=932, codec='cp932', fontdef_tree='{\f4\fswiss\fcharset128 "PMingLiU";}'), '\f5': fontdef(fnum='\f5', codepage=None, codec=None, fontdef_tree='{\f5\fswiss "Amnesty Trade Gothic";}'), '\f6': fontdef(fnum='\f6', codepage=None, codec=None, fontdef_tree='{\f6\fswiss "Arial";}')}

Expand source code
def get_font_table(tree: Tree) -> Tree:
    """Extract the font table group from the first 20 tokens of a .rtf document.

Args:
    tree (Tree): A .rtf document object parsed into a Tree object

Raises:
    ValueError: If no group with a `\\fonttbl` token as its first controlword is found.

Returns:
    The lark Tree group whose first controlword is `\\fonttbl`.
    """
    # The font table must appear in the RTF header, so only the first 20
    # top-level items need to be inspected.
    for candidate in tree.children[:20]:
        if not isinstance(candidate, Tree):
            continue
        if len(candidate.children) < 2:
            continue
        ctrl = candidate.children[1]
        if isinstance(ctrl, Token) and (ctrl.value.strip() == b"\\fonttbl"):
            return candidate
    raise ValueError("No font table found in tree")
def get_python_codec(codepage_num: int) ‑> str

Returns the python codec needed to decode bytes to unicode.

Args

codepage_num : int
A codepage number.

Returns

The name of the codec in the Python codec registry. Used as the name for encoding/decoding.

Expand source code
def get_python_codec(codepage_num: int) -> str:
    """Returns the python codec needed to decode bytes to unicode.

Args:
    codepage_num (int): A codepage number.

Returns:
    The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
"""
    # oletools maintains the codepage-number -> python-codec-name mapping.
    codec_name = codepages.codepage2codec(codepage_num)
    log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, codec_name))
    return codec_name
def get_unicode_char_byte_count(item: lark.lexer.Token) ‑> int
Expand source code
def get_unicode_char_byte_count(item: Token) -> int:
    """Extract the byte count N from a `\\ucN` control word token.

Args:
    item (Token): A CONTROLWORD token whose value is a `\\ucN` keyword.

Returns:
    N: the number of bytes which follow each `\\uN` unicode character as its ANSI representation.
"""
    codeword = item.value.decode()
    # Strip the leading '\uc' (3 chars), leaving only the numeric argument.
    return int(codeword[3:])
def has_hexarray(children: List[Union[lark.lexer.Token, lark.tree.Tree]]) ‑> bool

Checks if a tree's children include a hexarray tree.

children (array): the children object from a tree.

Expand source code
def has_hexarray(children: List[Union[Token, Tree]]) -> bool:
    """Checks if a tree's children include a hexarray tree.

    children (array): the children object from a tree.
    """
    return any(is_hexarray(child) for child in children)
def includes_unicode_chars(children: List[lark.lexer.Token]) ‑> bool

Does a list include Tokens which contain unicode characters. Not recursive.

Args

children : list
A Tree.children list to check to see if it includes unicode characters.

Returns

True if list includes tokens which contain unicode chars. False if not.

Expand source code
def includes_unicode_chars(children: List[Token]) -> bool:
    """Does a list include Tokens which contain unicode characters. Not recursive.

Args:
    children (list): A Tree.children list to check to see if it includes unicode characters.

Returns:
    True if list includes tokens which contain unicode chars. False if not.
"""
    return any(is_unicode_encoded(child) for child in children)
def is_font_number(token: lark.lexer.Token) ‑> bool

Checks if an object is a "font number".

Returns

True if an object is a "font number" controlword \fN. False if not.

Expand source code
def is_font_number(token: Token) -> bool:
    """Checks if an object is a "font number".

Returns:
    True if an object is a "font number" controlword `\\fN`. False if not.

"""
    try:
        return bool(is_codeword_with_numeric_arg(token, b'\\f'))
    except AttributeError: # pragma: no cover
        return False
def is_hex_encoded(item: lark.lexer.Token) ‑> bool

Identify if a token contains a HEXENCODED token.

Args

item : token
A token to check if it is HEXENCODED.

Return

True if HEXENCODED. False if not.

Expand source code
def is_hex_encoded(item: Token) -> bool:
    """Identify if a token contains a HEXENCODED token.
Args:
    item (token): A token to check if it is HEXENCODED.

Return:
    True if HEXENCODED. False if not.
    """
    return isinstance(item, Token) and (item.type == "HEXENCODED")
def is_hexarray(item)

Checks if an item is a hexarray tree.

item (Tree or Token): an item to check to see if its a hex array

Expand source code
def is_hexarray(item):
    """Checks if an item is a hexarray tree.

    item (Tree or Token): an item to check to see if its a hex array
    """
    # In this grammar the Tree's `data` attribute is a Token, so compare its value.
    return isinstance(item, Tree) and (item.data.value == 'hexarray')
def is_surrogate_16bit(item: bytes, cp_range) ‑> bool

Checks whether a unicode char is represented as a 16-bit signed integer rather than as the raw code point. Callers should first check that it is a surrogate code point using the is_surrogate_XXXX_char functions.

Args

item : bytes
A bytes representation of a string representing a unicode character.
cp_range : str
['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
Expand source code
def is_surrogate_16bit(item: bytes, cp_range) -> bool:
    """Checks whether a unicode char is expressed as a 16-bit signed integer rather than the raw code point. Callers should first confirm it is a surrogate code point using the is_surrogate_XXXX_char functions.

Args:
    item (bytes): A bytes representation of a string representing a unicode character.
    cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
    """
    # RTF's 16-bit signed form puts the code point 65536 below its real value;
    # round-trip through chr/ord so out-of-range values still raise ValueError.
    if cp_range == 'low':
        return 0xDC00 <= ord(chr(65536 + int(item))) <= 0xDFFF
    if cp_range == 'high':
        return 0xD800 <= ord(chr(65536 + int(item))) <= 0xDBFF
    raise ValueError("cp_range must be either 'low' or 'high'")
def is_surrogate_high_char(item: bytes) ‑> bool

Checks if a char is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate.

item (bytes): A bytes representation of a string representing a unicode character. "\u-10179"

Expand source code
def is_surrogate_high_char(item: bytes) -> bool:
    """Checks if a char is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate.

        item (bytes): A bytes representation of a string representing a unicode character. "\\u-10179"
    """
    if item.startswith(b"\\u"):
        item = item[2:]
    value = int(item)
    # First interpret as RTF's 16-bit signed integer form...
    if 0xD800 <= ord(chr(65536 + value)) <= 0xDBFF:
        return True
    # ...then fall back to the raw (unsigned) code point form.
    return 0xD800 <= value <= 0xDBFF
def is_surrogate_low_char(item: bytes) ‑> bool

Checks if a char is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." The low-surrogate is also sometimes known as the trailing (following) surrogate.

item (bytes): A bytes representation of a string representing a unicode character.

Expand source code
def is_surrogate_low_char(item: bytes) -> bool:
    """Checks if a char is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." The low-surrogate is also sometimes known as the trailing (following) surrogate.

        item (bytes): A bytes representation of a string representing a unicode character.
    """
    if item.startswith(b"\\u"):
        item = item[2:]
    value = int(item)
    # First interpret as RTF's 16-bit signed integer form...
    if 0xDC00 <= ord(chr(65536 + value)) <= 0xDFFF:
        return True
    # ...then fall back to the raw (unsigned) code point form.
    return 0xDC00 <= value <= 0xDFFF
def is_surrogate_pair(first: bytes, second: bytes) ‑> bool

Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.

Args

first : bytes
A bytes representation of a string representing the high-order byte in a surrogate char.
second : bytes
A bytes representation of a string representing the low-order byte in a surrogate char.
Expand source code
def is_surrogate_pair(first: bytes, second: bytes) -> bool:
    """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.

Args:
    first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
    second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
    """
    if not is_surrogate_high_char(first):
        return False
    if is_surrogate_low_char(second):
        return True
    # A high surrogate not followed by a low surrogate is invalid; log and reject.
    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second))
    return False
def is_unicode_char_byte_count(item: lark.lexer.Token) ‑> bool
Expand source code
def is_unicode_char_byte_count(item: Token) -> bool:
    """Checks whether a token is a `\\ucN` unicode byte-count control word."""
    return (isinstance(item, Token)
            and (item.type == "CONTROLWORD")
            and item.value.startswith(b'\\uc'))
def is_unicode_encoded(item: lark.lexer.Token) ‑> bool

Checks whether a token contains a unicode char.

Args

item : token
A token to check if contains a unicode char.

Return

True if token contains a unicode char. False if not.

Expand source code
def is_unicode_encoded(item: Token) -> bool:
    """Checks whether a token contains a unicode char.

Args:
    item (token): A token to check if contains a unicode char.

Return:
    True if token contains a unicode char. False if not.
"""
    return isinstance(item, Token) and (item.type == "UNICODE")
def is_valid_ANSI_representation_char(item: lark.lexer.Token) ‑> bool

Checks whether a token contains a valid ANSI representation string for a Unicode char.

Args

item : token
A token to check if it is a valid ANSI representation.

Return

True if token is an ansi representation of a unicode char. False if not.

Expand source code
def is_valid_ANSI_representation_char(item: Token) -> bool:
    """Checks whether a token contains a valid ANSI representation string for a Unicode char.

Args:
    item (token): A token to check if it is a valid ANSI representation.

Return:
    True if token is an ansi representation of a unicode char. False if not.
"""
    if not isinstance(item, Token):
        return False
    if is_hex_encoded(item):
        return True
    # Plain STRING tokens count, but whitespace-only strings do not.
    return (item.type == 'STRING') and (not item.value.isspace())
def merge_surrogate_chars(children, ascii_map, use_ASCII_alternatives_on_unicode_decode_failure=False)

Raises

ValueError
A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
Expand source code
def merge_surrogate_chars(children,
                          ascii_map,
                          use_ASCII_alternatives_on_unicode_decode_failure = False):
    """Merge adjacent high/low surrogate unicode tokens into single decoded STRING tokens.

Args:
    children (list): A Tree.children list which may contain surrogate-pair unicode tokens.
    ascii_map (dict): Unicode tokens mapped to the ASCII alternative tokens removed for them.
    use_ASCII_alternatives_on_unicode_decode_failure (bool): On True, substitute the ASCII alternative text instead of raising when a pair cannot be decoded.

Returns:
    The children list, with each surrogate pair replaced in place by one decoded STRING token and one blank STRING token (so the list length is unchanged).

Raises:
    ValueError:  A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
    """
    surrogate_start = None
    surrogate_high = None
    for i,c in enumerate(children):
        if isinstance(c, Tree):
            continue
        if is_unicode_encoded(c):
            if is_surrogate_high_char(c.value):
                surrogate_start = i
                surrogate_high = c
            elif surrogate_start is not None:
                if is_surrogate_low_char(c.value):
                    surrogate_low = c
                    try:
                        surrogate_value = decode_surrogate_pair(surrogate_high.value,
                                                                surrogate_low.value)
                        # Convert into STRING token, anchored at the high surrogate's position.
                        surrogate_tok = Token('STRING',
                                              surrogate_value,
                                              start_pos=surrogate_high.start_pos,
                                              end_pos=surrogate_low.end_pos,
                                              line=surrogate_high.line,
                                              end_line=surrogate_low.end_line,
                                              column=surrogate_high.column,
                                              end_column=surrogate_low.end_column)
                        children[surrogate_start] = surrogate_tok
                        # Blank out the low surrogate's slot so list length stays stable.
                        blank_tok = Token('STRING',
                                          b"",
                                          start_pos=surrogate_high.start_pos+1,
                                          end_pos=surrogate_low.end_pos+1,
                                          line=surrogate_high.line,
                                          end_line=surrogate_low.end_line,
                                          column=surrogate_high.column,
                                          end_column=surrogate_low.end_column)
                        children[i] = blank_tok
                        surrogate_start = None
                        surrogate_high = None
                    except UnicodeDecodeError as _e:
                        if use_ASCII_alternatives_on_unicode_decode_failure is True:
                            children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                            children[i] = b"".join([i.value for i in ascii_map[surrogate_low]])
                        else:
                            raise _e
                else:
                    # BUG FIX: this log call previously referenced `surrogate_low`,
                    # which is unbound (UnboundLocalError) when the first candidate
                    # pair fails here; log the current illegal token `c` instead.
                    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, c))
                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
                        children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                    else:
                        raise ValueError("Standalone high-surrogate found. High surrogate followed by a illegal low-surrogate character.")
    return children
def parse_font_tree(font_tree: lark.tree.Tree) ‑> dict

Create a font tree dictionary with appropriate codeces to decode text.

Args

font_tree : Tree
The .rtf font table object decoded as a tree.

Returns

A dictionary which maps font numbers to appropriate python codeces needed to decode text.

Expand source code
def parse_font_tree(font_tree: Tree) -> dict:
    """Create a font tree dictionary with appropriate codeces to decode text.

Args:
    font_tree (Tree): The .rtf font table object decoded as a tree.

Returns:
    A dictionary which maps font numbers to appropriate python codeces needed to decode text.
"""
    font_definitions: dict = {}
    for group in font_tree.children:
        if not isinstance(group, Tree):
            continue
        font_num = None
        charset_codepage = None
        cpg_codepage = None
        for tok in group.children:
            if is_codeword_with_numeric_arg(tok, b'\\f'):
                font_num = tok.value
            elif is_codeword_with_numeric_arg(tok, b'\\fcharset'):
                # Translate the \fcharsetN argument into a codepage number.
                charset_codepage = get_codepage_num_from_fcharset(int(tok.value[9:]))
            elif is_codeword_with_numeric_arg(tok, b'\\cpg'):
                cpg_codepage = int(tok.value[4:])
        if font_num is None:
            # Only groups containing a font definition (\fN) are recorded.
            continue
        # If both \fcharset and \cpg appear in the font table, \cpg is ignored;
        # an invalid \fcharset codepage falls through to \cpg.
        codepage_num = None
        for candidate in (charset_codepage, cpg_codepage):
            if (codepage_num is None) and (candidate is not None):
                try:
                    codepage_num = check_codepage_num(candidate)
                except ValueError: # pragma: no cover
                    codepage_num = None
        # Resolve the python codec for the chosen codepage, if any.
        codec = get_python_codec(codepage_num) if codepage_num is not None else None
        flattened = b"".join(flatten_tree_to_string_array(group))
        font_definitions[font_num] = fontdef(font_num, codepage_num, codec, flattened)
    return font_definitions
def remove_unicode_replacements(children: List[lark.lexer.Token], return_ascii_map: bool = True, byte_count: int = 1) ‑> Union[Tuple[List[lark.lexer.Token], Dict[lark.lexer.Token, List[lark.lexer.Token]]], List[lark.lexer.Token]]

Remove all unicode replacement characters from a list of Tokens.

Args

children : list
A Tree.children list to remove unicode replacement characters from.
return_ascii_map : bool
On True, have this function return a map of the ASCII token that were removed.
byte_count : int
The number of bytes corresponding to a given \uN Unicode character. A default of 1 should be assumed if no \uc keyword has been seen in the current or outer scopes.

Returns

new_children (list): The list of Tokens with all unicode replacement characters removed. ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode token they replace.

Expand source code
def remove_unicode_replacements(children: List[Token],
                                return_ascii_map: bool = True,
                                byte_count: int = 1) -> Union[
                                    Tuple[List[Token], Dict[Token,List[Token]]],
                                    List[Token]]:
    """Remove all unicode replacement characters from a list of Tokens.

Args:
    children (list): A Tree.children list to remove unicode replacement characters from.
    return_ascii_map (bool): On True, have this function return a map of the ASCII token that were removed.
    byte_count (int): The number of bytes corresponding to a given \\uN Unicode character.  A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.

Returns:
    new_children (list): The list of Tokens with all unicode replacement characters removed.
    ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode token they replace.

"""
    # NOTE(review): an unconditional `byte_count = 1` used to clobber the
    # byte_count argument here, leaving the documented parameter dead. It was
    # removed so callers tracking \uc state can actually pass it in; the
    # default still matches the previously forced value, so no-arg calls
    # behave exactly as before.
    ascii_map: Dict[Token,List[Token]]  = {}
    new_children = []
    # Each pending entry is a unicode token still owed one ANSI-alternative removal.
    removal_map: List[Token] = []
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Removing unicode replacements on {repr(children)}")
    for child in children:
        if len(removal_map) > 0:
            if isinstance(child, Token):
                # Delete all spaces between a unicode char and the last ANSI representation
                if child.value.isspace():
                    ascii_map.setdefault(removal_map[0], []).append(child)
                    continue
            if is_valid_ANSI_representation_char(child):
                # Found an ANSI representation: consume one pending removal.
                ascii_map.setdefault(removal_map.pop(), []).append(child)
                continue
            elif isinstance(child, Tree) and (
                    (child.data == "string") or (child.data == "hexarray")):
                # Walk into string/hexarray trees, consuming pending removals
                # for any valid ANSI representation chars found inside.
                ansi_children = child.children
                new_ansi_children = []
                for ac in ansi_children:
                    if is_valid_ANSI_representation_char(ac) and (len(removal_map) > 0):
                        ascii_map.setdefault(removal_map.pop(), []).append(ac)
                    else:
                        new_ansi_children.append(ac)
                if new_ansi_children == []:
                    # Everything inside was a replacement; swap in an empty STRING token.
                    from RTFDE.utils import make_token_replacement
                    child = make_token_replacement("STRING", b"", child)
                else:
                    child.children = new_ansi_children
        # Modify char byte count if we encounter a \ucN keyword.
        if is_unicode_char_byte_count(child):
            byte_count = get_unicode_char_byte_count(child)
        if is_unicode_encoded(child):
            for _ in range(byte_count):
                # Add the unicode key to the removal map once per byte
                # This ensures we remove the right number of ANSI representation chars
                removal_map.append(child)
        new_children.append(child)
    if return_ascii_map is True:
        return new_children, ascii_map
    return new_children
def unicode_escape_to_chr(item: bytes) ‑> str

Convert a unicode char from its decimal form to its unicode character representation. From "\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.

Args

item : str
A RTF Escape in the format \u[-]NNNNN.

Returns

The unicode character representation of the identified character

Raises

ValueError
The escaped unicode character is not valid.
Expand source code
def unicode_escape_to_chr(item: bytes) -> str:
    """Convert a unicode char from its decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.

Args:
    item (str): A RTF Escape in the format \\u[-]NNNNN.

Returns:
    The unicode character representation of the identified character

Raises:
    ValueError: The escaped unicode character is not valid.
"""
    marker = b'\\u'
    # Strip the leading "\u" escape marker if present; bare digits are accepted too.
    payload = item[len(marker):] if item.startswith(marker) else item
    try:
        code_point = int(payload)  # raises ValueError on non-numeric input
    except ValueError as err:
        raise ValueError(f"`{item}` is not a valid escaped unicode character.") from err
    # § -NNNNN is a negative integer expressed in decimal digits;
    # the RTF spec stores code points above 32767 as negative 16-bit values.
    if code_point < 0:
        code_point += 65536
    return chr(code_point)
def validate_ansi_cpg(header: str) ‑> None

Check an '\ansicpgNNNN' string to see if the number NNNN is an actual codepage.

Args

header : str
The value from the lark \ansicpg CONTROLWORD Token.

Raises

MalformedRtf
If the value passed is not a valid ansi codepage.
Expand source code
def validate_ansi_cpg(header: str) -> None:
    """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage.

Args:
    header (str): The value from the lark `\\ansicpg` CONTROLWORD Token.

Raises:
    MalformedRtf: If the value passed is not a valid ansi codepage.
"""
    # Everything after the literal "\ansicpg" prefix (8 chars) is the codepage number.
    codepage_digits = header.strip()[8:]
    try:
        check_codepage_num(int(codepage_digits))
    except ValueError as err:
        raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from err

Classes

class TextDecoder (keep_fontdef=False, initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False)

keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed. initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information. use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.

Expand source code
class TextDecoder:
    """Walks a parsed RTF tree and decodes its text content in place.

    Resolves font definitions, unicode escapes (\\uNNNNN), unicode byte
    counts (\\ucN) and hex-encoded byte runs into plain STRING tokens.
    """

    def __init__(self, keep_fontdef=False,
               initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False):
        """
        keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed.
        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information.
        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.

        """
        self.keep_fontdef = keep_fontdef
        # Unicode Character Byte Count (\ucN); consumed by prep_unicode via
        # remove_unicode_replacements.
        self.ucbc = initial_byte_count
        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure

        # Font table values populated by set_font_info()
        self.default_font = None
        self.font_stack = []
        self.font_table = {}


    def set_font_info(self, obj: Tree) -> None:
        """Populate default font, font stack and font table from a full tree.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        self.default_font = get_default_font(obj)
        # The default font is the bottom of the stack; group-scoped fonts are
        # pushed/popped on top of it during iterate_on_children.
        self.font_stack = [self.default_font]
        # obj.children[1] is the document group that holds the font table.
        raw_fonttbl = get_font_table(obj.children[1])
        self.font_table = parse_font_tree(raw_fonttbl)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")


    def update_children(self, obj: Tree) -> None:
        """Decode all children of a full tree in place.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        # Reset font info
        self.set_font_info(obj)
        children = obj.children
        obj.children = [i for i in self.iterate_on_children(children)]

    def prep_unicode(self, children: List[Token]):
        """Strip unicode replacement chars and merge surrogate pairs.

        Returns the (possibly rewritten) children list; a no-op when the
        children contain no unicode escapes.
        """
        if includes_unicode_chars(children):
            # Clean out all replacement chars
            # log_text_extraction("Prepping Unicode Chars:" + repr(children))
            children, ascii_map = remove_unicode_replacements(children,
                                                              byte_count=self.ucbc)
            # print("===\nCHILD:" + repr(children))
            # print("===\nASCII:" + repr(ascii_map))
            # Merge all surrogate pairs
            children = merge_surrogate_chars(children,
                                             ascii_map,
                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
            # print("FINAL CHILDREN")
            # log_text_extraction("Replaced Unicode Chars With: " + repr(children))
        return children

    def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
        """Generator: yield decoded replacements for each child token/tree.

        Recurses into nested Trees; fonts set inside this group are popped
        off the font stack when the group ends.
        """
        set_fonts = []
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("Starting to iterate on text extraction children...")
            log_text_extraction("PREP-BEFORE: "+repr(children))
        children = self.prep_unicode(children)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("PREP-AFTER: "+repr(children))

        for item in children:
            if is_font_number(item): # Font Definitions
                self.font_stack.append(item.value.strip())
                set_fonts.append(item.value)
                # Fontdef tokens are dropped unless the caller asked to keep them.
                if self.keep_fontdef is True:
                    yield item
            elif is_unicode_char_byte_count(item):
                # NOTE(review): `bc` is assigned but never read; the byte count
                # actually applied lives in self.ucbc (used by prep_unicode).
                # Looks like this branch only consumes the token — confirm intent.
                bc = get_unicode_char_byte_count(item)
            elif is_unicode_encoded(item): # Unicode Chars
                decoded = unicode_escape_to_chr(item.value).encode()
                # Convert into STRING token, preserving source positions.
                decoded_tok = Token('STRING',
                                    decoded,
                                    start_pos=item.start_pos,
                                    end_pos=item.end_pos,
                                    line=item.line,
                                    end_line=item.end_line,
                                    column=item.column,
                                    end_column=item.end_column)
                if is_logger_on("RTFDE.text_extraction") is True:
                    log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
                yield decoded_tok
            # Decode a hex array
            elif is_hexarray(item):
                # print("IS Hex?? {0}".format(item))
                # Concatenate all hex-encoded bytes before decoding so
                # multi-byte codepoints (e.g. cp932) decode correctly.
                base_bytes = None
                for hexchild in item.children:
                    if base_bytes is None:
                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
                    else:
                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
                # Decode with the codec of the font currently in effect.
                current_fontdef = self.font_table[self.font_stack[-1]]
                current_codec = current_fontdef.codec
                decoded_hex = decode_hex_char(base_bytes, current_codec)
                # We are replacing a Tree. So, need item.data to access it's info token
                decoded_hex_tok = Token('STRING',
                                        decoded_hex,
                                        start_pos=item.data.start_pos,
                                        end_pos=item.data.end_pos,
                                        line=item.data.line,
                                        end_line=item.data.end_line,
                                        column=item.data.column,
                                        end_column=item.data.end_column)
                yield decoded_hex_tok
            elif isinstance(item, Tree):
                # Run this same function recursively on nested trees
                item.children = [i for i in self.iterate_on_children(item.children)]
                yield item
            else:
                yield item
        for i in set_fonts:
            # Remove all fonts defined while in this group
            self.font_stack.pop()

Methods

def iterate_on_children(self, children)
Expand source code
def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
    set_fonts = []
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("Starting to iterate on text extraction children...")
        log_text_extraction("PREP-BEFORE: "+repr(children))
    children = self.prep_unicode(children)
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("PREP-AFTER: "+repr(children))

    for item in children:
        if is_font_number(item): # Font Definitions
            self.font_stack.append(item.value.strip())
            set_fonts.append(item.value)
            if self.keep_fontdef is True:
                yield item
        elif is_unicode_char_byte_count(item):
            bc = get_unicode_char_byte_count(item)
        elif is_unicode_encoded(item): # Unicode Chars
            decoded = unicode_escape_to_chr(item.value).encode()
            # Convert into STRING token
            decoded_tok = Token('STRING',
                                decoded,
                                start_pos=item.start_pos,
                                end_pos=item.end_pos,
                                line=item.line,
                                end_line=item.end_line,
                                column=item.column,
                                end_column=item.end_column)
            if is_logger_on("RTFDE.text_extraction") is True:
                log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
            yield decoded_tok
        # Decode a hex array
        elif is_hexarray(item):
            # print("IS Hex?? {0}".format(item))
            base_bytes = None
            for hexchild in item.children:
                if base_bytes is None:
                    base_bytes = get_bytes_from_hex_encoded(hexchild.value)
                else:
                    base_bytes += get_bytes_from_hex_encoded(hexchild.value)
            current_fontdef = self.font_table[self.font_stack[-1]]
            current_codec = current_fontdef.codec
            decoded_hex = decode_hex_char(base_bytes, current_codec)
            # We are replacing a Tree. So, need item.data to access it's info token
            decoded_hex_tok = Token('STRING',
                                    decoded_hex,
                                    start_pos=item.data.start_pos,
                                    end_pos=item.data.end_pos,
                                    line=item.data.line,
                                    end_line=item.data.end_line,
                                    column=item.data.column,
                                    end_column=item.data.end_column)
            yield decoded_hex_tok
        elif isinstance(item, Tree):
            # Run this same function recursively on nested trees
            item.children = [i for i in self.iterate_on_children(item.children)]
            yield item
        else:
            yield item
    for i in set_fonts:
        # Remove all fonts defined while in this group
        self.font_stack.pop()
def prep_unicode(self, children: List[lark.lexer.Token])
Expand source code
def prep_unicode(self, children: List[Token]):
    if includes_unicode_chars(children):
        # Clean out all replacement chars
        # log_text_extraction("Prepping Unicode Chars:" + repr(children))
        children, ascii_map = remove_unicode_replacements(children,
                                                          byte_count=self.ucbc)
        # print("===\nCHILD:" + repr(children))
        # print("===\nASCII:" + repr(ascii_map))
        # Merge all surrogate pairs
        children = merge_surrogate_chars(children,
                                         ascii_map,
                                         self.use_ASCII_alternatives_on_unicode_decode_failure)
        # print("FINAL CHILDREN")
        # log_text_extraction("Replaced Unicode Chars With: " + repr(children))
    return children
def set_font_info(self, obj: lark.tree.Tree)

obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.

Expand source code
def set_font_info(self, obj: Tree):
    """

    obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
    """
    self.default_font = get_default_font(obj)
    self.font_stack = [self.default_font]
    raw_fonttbl = get_font_table(obj.children[1])
    self.font_table = parse_font_tree(raw_fonttbl)
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")
def update_children(self, obj: lark.tree.Tree)

obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.

Expand source code
def update_children(self, obj: Tree):
    """

    obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
    """
    # Reset font info
    self.set_font_info(obj)
    children = obj.children
    obj.children = [i for i in self.iterate_on_children(children)]
class fontdef (fnum, codepage, codec, fontdef_tree)

fontdef(fnum, codepage, codec, fontdef_tree)

Ancestors

  • builtins.tuple

Instance variables

var codec

Alias for field number 2

var codepage

Alias for field number 1

var fnum

Alias for field number 0

var fontdef_tree

Alias for field number 3