Module RTFDE.text_extraction
Expand source code
# -*- coding: utf-8 -*-
#
# This file is part of RTFDE, a RTF De-Encapsulator.
# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.
import codecs
import re
from collections import namedtuple
from typing import Union, Any, List, Tuple, Dict
from oletools.common import codepages
from lark.lexer import Token
from lark.tree import Tree
from RTFDE.exceptions import MalformedRtf
from RTFDE.utils import is_codeword_with_numeric_arg
from RTFDE.utils import flatten_tree_to_string_array
from RTFDE.utils import log_text_extraction, is_logger_on
import logging
log = logging.getLogger("RTFDE")
fontdef = namedtuple("fontdef", ["fnum", "codepage", "codec", "fontdef_tree"])
def get_font_table(tree: Tree) -> Tree:
    """Locate the font table group within the first 20 children of a parsed .rtf document.

    Args:
        tree (Tree): A .rtf document object parsed into a Tree object.

    Raises:
        ValueError: If no group with a `\\fonttbl` token as its first controlword is found.

    Returns:
        The lark Tree object representing the `{\\fonttbl ...}` group.
    """
    for candidate in tree.children[:20]:
        if not isinstance(candidate, Tree):
            continue
        try:
            # The controlword naming the group sits at index 1 (after the opening brace).
            group_ctrl = candidate.children[1]
        except IndexError:
            continue
        if isinstance(group_ctrl, Token) and group_ctrl.value.strip() == b"\\fonttbl":
            return candidate
    raise ValueError("No font table found in tree")
def is_font_number(token: Token) -> bool:
    """Check whether a token is a "font number" controlword (`\\fN`).

    Returns:
        True if the token is a `\\fN` controlword with a numeric argument. False if not.
    """
    try:
        return bool(is_codeword_with_numeric_arg(token, b'\\f'))
    except AttributeError:  # pragma: no cover
        # Objects without the expected token attributes are not font numbers.
        return False
def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int, None]:
    """Return the codepage to use with a specific fcharsetN.

    Args:
        fcharsetN (int): The numeric argument N for a \\fcharsetN control word.

    Returns:
        (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)
    """
    # Charset table retrieved on 2022-08-19
    # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN
    # Maps \fcharsetN values directly to their Windows codepage.
    # None marks charsets with no corresponding codepage.
    charset_to_codepage: Dict[int, Union[int, None]] = {
        0: 1252,    # ANSI_CHARSET (0x00)
        1: None,    # DEFAULT_CHARSET (0x01)
        2: None,    # SYMBOL_CHARSET (0x02)
        128: 932,   # SHIFTJIS_CHARSET (0x80)
        129: 949,   # HANGUL_CHARSET (0x81)
        134: 936,   # GB2312_CHARSET (0x86)
        136: 950,   # CHINESEBIG5_CHARSET (0x88)
        161: 1253,  # GREEK_CHARSET (0xA1)
        162: 1254,  # TURKISH_CHARSET (0xA2)
        177: 1255,  # HEBREW_CHARSET (0xB1)
        178: 1256,  # ARABIC_CHARSET (0xB2)
        186: 1257,  # BALTIC_CHARSET (0xBA)
        204: 1251,  # RUSSIAN_CHARSET (0xCC)
        222: 874,   # THAI_CHARSET (0xDE)
        238: 1250,  # EE_CHARSET (0xEE)
        255: None,  # OEM_CHARSET (0xFF)
    }
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Getting charset for {fcharsetN}")
    # Unknown charsets and charsets without a codepage both yield None.
    return charset_to_codepage.get(fcharsetN, None)
def get_default_font(tree: Tree) -> Union[str, None]:
    """Extract the default font number controlword (`\\deffN`) if it exists.

    If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group.

    Args:
        tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object.

    Returns:
        The default font control number (e.g. b'\\f0') from the first `\\deffN`. None if not found.
    """
    matches = tree.scan_values(
        lambda v: is_codeword_with_numeric_arg(v, b'\\deff')
    )
    for match in matches:
        # Only the first \deffN counts; it shouldn't be set multiple times.
        return b'\\f' + match.value[5:]
    return None
def parse_font_tree(font_tree: Tree) -> dict:
    """Create a font tree dictionary with appropriate codecs to decode text.

    Args:
        font_tree (Tree): The .rtf font table object decoded as a tree.

    Returns:
        A dictionary which maps font number controlwords (e.g. b'\\f0') to fontdef
        namedtuples holding the codepage, python codec and raw font definition.
    """
    font_definitions = {}
    for definition in font_tree.children:
        if not isinstance(definition, Tree):
            continue
        font_num = None
        charset_codepage = None
        explicit_cpg = None
        for tok in definition.children:
            if is_codeword_with_numeric_arg(tok, b'\\f'):
                font_num = tok.value
            elif is_codeword_with_numeric_arg(tok, b'\\fcharset'):
                charset_codepage = get_codepage_num_from_fcharset(int(tok.value[9:]))
            elif is_codeword_with_numeric_arg(tok, b'\\cpg'):
                explicit_cpg = int(tok.value[4:])
        # Only record entries which actually define a font number.
        if font_num is None:
            continue
        codepage_num = None
        if charset_codepage is not None:
            try:
                codepage_num = check_codepage_num(charset_codepage)
            except ValueError:  # pragma: no cover
                codepage_num = None
        # If both \fcharset and \cpg appear in the font table, \cpg is ignored.
        if (codepage_num is None) and (explicit_cpg is not None):
            try:
                codepage_num = check_codepage_num(explicit_cpg)
            except ValueError:  # pragma: no cover
                codepage_num = None
        # Resolve the python codec for the codepage, if any.
        codec = get_python_codec(codepage_num) if codepage_num is not None else None
        raw_def = b"".join(flatten_tree_to_string_array(definition))
        font_definitions[font_num] = fontdef(font_num, codepage_num, codec, raw_def)
    return font_definitions
def get_python_codec(codepage_num: int) -> str:
    """Returns the python codec needed to decode bytes to unicode.

    Args:
        codepage_num (int): A codepage number.

    Returns:
        The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
    """
    # oletools' codepage2codec maps a Windows codepage number to a python codec name.
    text_codec = codepages.codepage2codec(codepage_num)
    log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, text_codec))
    return text_codec
def check_codepage_num(codepage_num: int) -> int:
    """Provide the codepage number back to you if it is valid.

    Args:
        codepage_num (int): A possible codepage number.

    Returns:
        The codepage number IF it is a valid codepage number

    Raises:
        ValueError: The codepage_num provided isn't a valid codepage number.
    """
    # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it.
    # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers
    # Retrieved on 2020-12-18
    valid_codepages = frozenset((
        37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858,
        860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949,
        950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148,
        1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258,
        1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008,
        10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000,
        20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127,
        20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297,
        20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924,
        20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594,
        28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220,
        50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936,
        50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003,
        57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001))
    if codepage_num not in valid_codepages:
        # Note: If support for a specific codepage ever becomes an issue we can look at add support using the actual code-pages.
        # Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/
        raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
    return codepage_num
def validate_ansi_cpg(header: str) -> None:
    """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage.

    Args:
        header (str): The value from the lark `\\ansicpg` CONTROLWORD Token.

    Raises:
        MalformedRtf: If the value passed is not a valid ansi codepage.
    """
    try:
        # Skip the 8-char '\ansicpg' prefix; the remainder must be a known codepage.
        check_codepage_num(int(header.strip()[8:]))
    except ValueError as _e:
        raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e
# UNICODE CHARS
def unicode_escape_to_chr(item: bytes) -> str:
    """Convert unicode char from its decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.

    Args:
        item (bytes): A RTF Escape in the format \\u[-]NNNNN.

    Returns:
        The unicode character representation of the identified character

    Raises:
        ValueError: The escaped unicode character is not valid.
    """
    digits = item[len(b'\\u'):] if item.startswith(b'\\u') else item
    try:
        code_point = int(digits)  # raises ValueError if not an integer.
    except ValueError as _e:
        raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e
    # § -NNNNN is a negative integer expressed in decimal digits (16-bit
    # signed representation); shift it back into the positive code range.
    if code_point < 0:
        code_point += 65536
    return chr(code_point)
def is_hex_encoded(item: Token) -> bool:
    """Identify if a token contains a HEXENCODED token.

    Args:
        item (token): A token to check if it is HEXENCODED.

    Return:
        True if HEXENCODED. False if not.
    """
    return isinstance(item, Token) and (item.type == "HEXENCODED")
def is_valid_ANSI_representation_char(item: Token) -> bool:
    """Check whether a token is a valid ANSI representation string for a Unicode char.

    Args:
        item (token): A token to check if it is a valid ANSI representation.

    Return:
        True if token is an ansi representation of a unicode char. False if not.
    """
    if not isinstance(item, Token):
        return False
    if is_hex_encoded(item):
        return True
    # Whitespace-only STRING tokens don't count as a representation char.
    return (item.type == 'STRING') and (not item.value.isspace())
def is_unicode_encoded(item: Token) -> bool:
    """Check whether a token contains a unicode char.

    Args:
        item (token): A token to check if contains a unicode char.

    Return:
        True if token contains a unicode char. False if not.
    """
    return isinstance(item, Token) and (item.type == "UNICODE")
def includes_unicode_chars(children: List[Token]) -> bool:
    """Check whether a list includes Tokens which contain unicode characters. Not recursive.

    Args:
        children (list): A Tree.children list to check to see if it includes unicode characters.

    Returns:
        True if list includes tokens which contain unicode chars. False if not.
    """
    return any(is_unicode_encoded(child) for child in children)
def remove_unicode_replacements(children: List[Token],
                                return_ascii_map: bool = True,
                                byte_count: Union[int, None] = 1) -> Union[
                                    Tuple[List[Token], Dict[Token, List[Token]]],
                                    List[Token]]:
    """Remove all unicode replacement characters from a list of Tokens.

    Args:
        children (list): A Tree.children list to remove unicode replacement characters from.
        return_ascii_map (bool): On True, have this function return a map of the ASCII tokens that were removed.
        byte_count (int): The number of bytes corresponding to a given \\uN Unicode character. A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.

    Returns:
        new_children (list): The list of Tokens with all unicode replacement characters removed.
        ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode Token they were a replacement for.
    """
    # BUG FIX: this previously reset byte_count to 1 unconditionally, silently
    # discarding the caller-supplied value. Only fall back to the spec default
    # of 1 when no byte count was provided (e.g. ucbc was never initialized).
    if byte_count is None:
        byte_count = 1
    ascii_map: Dict[Token, List[Token]] = {}
    new_children = []
    removal_map: List[Token] = []
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Removing unicode replacements on {repr(children)}")
    for child in children:
        if len(removal_map) > 0:
            if isinstance(child, Token):
                # Delete all spaces between a unicode char and the last ANSI representation
                if child.value.isspace():
                    ascii_map.setdefault(removal_map[0], []).append(child)
                    continue
                if is_valid_ANSI_representation_char(child):
                    # Found an ANSI representation; remove one unicode char from the removal map.
                    ascii_map.setdefault(removal_map.pop(), []).append(child)
                    continue
            elif isinstance(child, Tree) and (
                    (child.data == "string") or (child.data == "hexarray")):
                # ANSI representations may be nested inside string/hexarray trees.
                ansi_children = child.children
                new_ansi_children = []
                for ac in ansi_children:
                    if is_valid_ANSI_representation_char(ac):
                        if len(removal_map) > 0:
                            # Still have pending replacements; consume this one.
                            ascii_map.setdefault(removal_map.pop(), []).append(ac)
                        else:
                            new_ansi_children.append(ac)
                    else:
                        new_ansi_children.append(ac)
                if new_ansi_children == []:
                    # The whole tree was consumed as replacements; leave an
                    # empty STRING placeholder so positions stay aligned.
                    from RTFDE.utils import make_token_replacement
                    child = make_token_replacement("STRING", b"", child)
                else:
                    child.children = new_ansi_children
        # Modify char byte count if we encounter a \ucN keyword.
        if is_unicode_char_byte_count(child):
            byte_count = get_unicode_char_byte_count(child)
        if is_unicode_encoded(child):
            # Add the unicode key to the removal map once per byte.
            # This ensures we remove the right number of ANSI representation chars.
            for _ in range(byte_count):
                removal_map.append(child)
        new_children.append(child)
    if return_ascii_map is True:
        return new_children, ascii_map
    return new_children
# UNICODE SURROGATE CHARACTERS
def is_surrogate_high_char(item: bytes) -> bool:
    """Check if a char is in the high-surrogate code point range.

    "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." High-surrogate is also sometimes known as the leading surrogate.

    item (bytes): A bytes representation of a string representing a unicode character, e.g. "\\u-10179".
    """
    if item.startswith(b"\\u"):
        item = item[2:]
    # RTF often expresses code points as 16-bit signed integers; re-base first.
    if 0xD800 <= ord(chr(65536 + int(item))) <= 0xDBFF:
        return True
    # In case unicode is NOT using the 16 bit signed integer
    if 0xD800 <= int(item) <= 0xDBFF:
        return True
    return False
def is_surrogate_low_char(item: bytes) -> bool:
    """Check if a char is in the low-surrogate code point range.

    "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." Low-surrogates are also sometimes known as following surrogates.

    item (bytes): A bytes representation of a string representing a unicode character.
    """
    if item.startswith(b"\\u"):
        item = item[2:]
    # RTF often expresses code points as 16-bit signed integers; re-base first.
    if 0xDC00 <= ord(chr(65536 + int(item))) <= 0xDFFF:
        return True
    # In case unicode is NOT using the 16 bit signed integer
    if 0xDC00 <= int(item) <= 0xDFFF:
        return True
    return False
def is_surrogate_16bit(item: bytes, cp_range) -> bool:
    """Check if a unicode char is a 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions.

    Args:
        item (bytes): A bytes representation of a string representing a unicode character.
        cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).

    Raises:
        ValueError: cp_range is neither 'low' nor 'high'.
    """
    if cp_range == 'low':
        lower, upper = 0xDC00, 0xDFFF
    elif cp_range == 'high':
        lower, upper = 0xD800, 0xDBFF
    else:
        raise ValueError("cp_range must be either 'low' or 'high'")
    # Re-base the 16-bit signed value and test against the surrogate range.
    return lower <= ord(chr(65536 + int(item))) <= upper
def is_surrogate_pair(first: bytes, second: bytes) -> bool:
    """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.

    Args:
        first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char.
        second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char.
    """
    if not is_surrogate_high_char(first):
        return False
    if is_surrogate_low_char(second):
        return True
    # A high surrogate without a matching low surrogate is invalid; log and reject.
    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second))
    return False
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str = 'utf-16-le') -> bytes:
    """Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.

    Args:
        high (bytes): the high-surrogate code point
        low (bytes): the low-surrogate code point
        encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
    """
    # Equation for turning surrogate pairs into a unicode scalar value which can be used with utf-16 can ONLY be found in the Unicode 3.0.0 standard.
    # Unicode scalar value means the same thing as "code position" or "code point"
    # https://www.unicode.org/versions/Unicode3.0.0/
    # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9
    def _to_code_point(raw: bytes, cp_range: str) -> int:
        # Strip the \u escape prefix and re-base 16-bit signed values.
        if raw.startswith(b"\\u"):
            raw = raw[2:]
        if is_surrogate_16bit(raw, cp_range):
            return 65536 + int(raw)
        return int(raw)

    high_cp = _to_code_point(high, "high")
    low_cp = _to_code_point(low, "low")
    unicode_scalar_value = ((high_cp - 0xD800) * 0x400) + (low_cp - 0xDC00) + 0x10000
    unicode_bytes = chr(unicode_scalar_value).encode(encoding)
    # Round-trip through the target encoding, then return utf-8 bytes.
    return unicode_bytes.decode(encoding).encode()
def merge_surrogate_chars(children,
                          ascii_map,
                          use_ASCII_alternatives_on_unicode_decode_failure = False):
    """Merge adjacent high/low surrogate pair tokens into single STRING tokens.

    Args:
        children (list): A Tree.children list possibly containing UNICODE surrogate tokens.
        ascii_map (dict): Map of unicode tokens to their removed ASCII alternative tokens.
        use_ASCII_alternatives_on_unicode_decode_failure (bool): On decode failure, substitute the ASCII alternatives instead of raising.

    Returns:
        The children list with surrogate pairs merged in place.

    Raises:
        ValueError: A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
        UnicodeDecodeError: A surrogate pair failed to decode and ASCII alternatives were not enabled.
    """
    surrogate_start = None
    surrogate_high = None
    for i,c in enumerate(children):
        if isinstance(c, Tree):
            continue
        if is_unicode_encoded(c):
            if is_surrogate_high_char(c.value):
                surrogate_start = i
                surrogate_high = c
            elif surrogate_start is not None:
                if is_surrogate_low_char(c.value):
                    surrogate_low = c
                    try:
                        surrogate_value = decode_surrogate_pair(surrogate_high.value,
                                                                surrogate_low.value)
                        # Convert into STRING token
                        surrogate_tok = Token('STRING',
                                              surrogate_value,
                                              start_pos=surrogate_high.start_pos,
                                              end_pos=surrogate_low.end_pos,
                                              line=surrogate_high.line,
                                              end_line=surrogate_low.end_line,
                                              column=surrogate_high.column,
                                              end_column=surrogate_low.end_column)
                        children[surrogate_start] = surrogate_tok
                        # Blank out the low token's slot so list length is preserved.
                        blank_tok = Token('STRING',
                                          b"",
                                          start_pos=surrogate_high.start_pos+1,
                                          end_pos=surrogate_low.end_pos+1,
                                          line=surrogate_high.line,
                                          end_line=surrogate_low.end_line,
                                          column=surrogate_high.column,
                                          end_column=surrogate_low.end_column)
                        children[i] = blank_tok
                        surrogate_start = None
                        surrogate_high = None
                    except UnicodeDecodeError as _e:
                        if use_ASCII_alternatives_on_unicode_decode_failure is True:
                            children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                            children[i] = b"".join([i.value for i in ascii_map[surrogate_low]])
                        else:
                            raise _e
                else:
                    # BUG FIX: this log call previously referenced `surrogate_low`,
                    # which is unbound on the first failed pairing (NameError);
                    # log the current (illegal) token instead.
                    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, c))
                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
                        children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                    else:
                        raise ValueError("Standalone high-surrogate found. High surrogate followed by a illegal low-surrogate character.")
    return children
def is_unicode_char_byte_count(item: Token) -> bool:
    """Check whether a token is a \\ucN (unicode replacement byte count) controlword."""
    if not isinstance(item, Token):
        return False
    return (item.type == "CONTROLWORD") and item.value.startswith(b'\\uc')
def get_unicode_char_byte_count(item: Token) -> int:
    """Extract the numeric argument N from a \\ucN controlword token."""
    # Skip the 3-char '\uc' prefix; the remainder is the byte count.
    return int(item.value.decode()[3:])
# Hex Encoded Chars
def has_hexarray(children: List[Union[Token, Tree]]) -> bool:
    """Check if a tree's children include a hexarray tree.

    children (array): the children object from a tree.
    """
    return any(is_hexarray(item) for item in children)
def is_hexarray(item):
    """Check if an item is a hexarray tree.

    item (Tree or Token): an item to check to see if it's a hex array
    """
    return isinstance(item, Tree) and (item.data.value == 'hexarray')
def get_bytes_from_hex_encoded(item):
    """Convert a hex encoded string into the bytes it encodes.

    item (bytes): one or more hex escapes in the format \\'XX
    """
    # Strip the \' escape markers, leaving only the hex digits to decode.
    digits = item.replace(b"\\'", b"")
    return bytes.fromhex(digits.decode())
def decode_hex_char(item, codec):
    """Decode a bytes object using a specified codec.

    item (bytes): A bytes object.
    codec (str): The name of the codec to use to decode the bytes. Falls back to CP1252 when None.
    """
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("decoding char {0} with font {1}".format(item, codec))
    if codec is None:
        # Default to U.S. Windows default codepage
        codec = 'CP1252'
    # Decode with the font's codec, then re-encode as utf-8 bytes.
    decoded = item.decode(codec).encode()
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec))
    return decoded
class TextDecoder:
    """Decodes encoded text (unicode escapes, hex escapes, font charsets) within a parsed RTF tree."""

    def __init__(self, keep_fontdef=False,
                 initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False):
        """
        keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed.
        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information.
        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
        """
        self.keep_fontdef = keep_fontdef
        self.ucbc = initial_byte_count
        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure
        # Font table values set by set_font_info
        self.default_font = None  # default font controlword (e.g. b'\f0') or None
        self.font_stack = []      # stack of font controlwords currently in scope
        self.font_table = {}      # maps font controlwords to fontdef namedtuples

    def set_font_info(self, obj: Tree):
        """Extract and store the default font and parsed font table from a full tree.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        self.default_font = get_default_font(obj)
        self.font_stack = [self.default_font]
        # obj.children[1] is presumably the document group holding the header tables — TODO confirm against parser grammar.
        raw_fonttbl = get_font_table(obj.children[1])
        self.font_table = parse_font_tree(raw_fonttbl)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")

    def update_children(self, obj: Tree):
        """Decode all encoded text in a tree, replacing its children in place.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        # Reset font info
        self.set_font_info(obj)
        children = obj.children
        obj.children = [i for i in self.iterate_on_children(children)]

    def prep_unicode(self, children: List[Token]):
        """Strip unicode replacement (ASCII fallback) chars and merge surrogate pairs in a children list."""
        if includes_unicode_chars(children):
            # Clean out all replacement chars
            children, ascii_map = remove_unicode_replacements(children,
                                                             byte_count=self.ucbc)
            # Merge all surrogate pairs
            children = merge_surrogate_chars(children,
                                             ascii_map,
                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
        return children

    def iterate_on_children(self, children):
        """Yield decoded replacements for each child, recursing into nested trees.

        Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
        """
        # Track fonts defined in this group so they can be popped on exit.
        set_fonts = []
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("Starting to iterate on text extraction children...")
            log_text_extraction("PREP-BEFORE: "+repr(children))
        children = self.prep_unicode(children)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("PREP-AFTER: "+repr(children))
        for item in children:
            if is_font_number(item):  # Font Definitions
                self.font_stack.append(item.value.strip())
                set_fonts.append(item.value)
                # Fontdef tokens are dropped from the output unless requested.
                if self.keep_fontdef is True:
                    yield item
            elif is_unicode_char_byte_count(item):
                # NOTE(review): `bc` is assigned but never used, and self.ucbc is
                # not updated here — byte-count tracking appears to happen inside
                # remove_unicode_replacements instead. Verify this is intentional.
                bc = get_unicode_char_byte_count(item)
            elif is_unicode_encoded(item):  # Unicode Chars
                decoded = unicode_escape_to_chr(item.value).encode()
                # Convert into STRING token, preserving the original token's position info.
                decoded_tok = Token('STRING',
                                    decoded,
                                    start_pos=item.start_pos,
                                    end_pos=item.end_pos,
                                    line=item.line,
                                    end_line=item.end_line,
                                    column=item.column,
                                    end_column=item.end_column)
                if is_logger_on("RTFDE.text_extraction") is True:
                    log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
                yield decoded_tok
            # Decode a hex array
            elif is_hexarray(item):
                # Concatenate all hex escapes first so multi-byte chars decode correctly.
                base_bytes = None
                for hexchild in item.children:
                    if base_bytes is None:
                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
                    else:
                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
                # Decode using the codec of the font currently in scope.
                current_fontdef = self.font_table[self.font_stack[-1]]
                current_codec = current_fontdef.codec
                decoded_hex = decode_hex_char(base_bytes, current_codec)
                # We are replacing a Tree. So, need item.data to access it's info token
                decoded_hex_tok = Token('STRING',
                                        decoded_hex,
                                        start_pos=item.data.start_pos,
                                        end_pos=item.data.end_pos,
                                        line=item.data.line,
                                        end_line=item.data.end_line,
                                        column=item.data.column,
                                        end_column=item.data.end_column)
                yield decoded_hex_tok
            elif isinstance(item, Tree):
                # Run this same function recursively on nested trees
                item.children = [i for i in self.iterate_on_children(item.children)]
                yield item
            else:
                yield item
        for i in set_fonts:
            # Remove all fonts defined while in this group
            self.font_stack.pop()
Functions
def check_codepage_num(codepage_num: int) ‑> int
-
Provide the codepage number back to you if it is valid.
Args
codepage_num
:int
- A possible codepage number.
Returns
The codepage number IF it is a valid codepage number
Raises
ValueError
- The codepage_num provided isn't a valid codepage number.
Expand source code
def check_codepage_num(codepage_num: int) -> int: """Provide the codepage number back to you if it is valid. Args: codepage_num (int): A possible codepage number. Returns: The codepage number IF it is a valid codepage number Raises: ValueError: The codepage_num provided isn't a valid codepage number. """ # This keyword should be emitted in the RTF header section right after the \ansi, \mac, \pc or \pca keyword. But, various document tags like \fbids often are thrown all over the header so we have to check the first group of headers for it. # Code page names from https://docs.microsoft.com/en-gb/windows/desktop/Intl/code-page-identifiers # Retrieved on 2020-12-18 allowed_codepage_nums = set([37, 437, 500, 708, 709, 710, 720, 737, 775, 850, 852, 855, 857, 858, 860, 861, 862, 863, 864, 865, 866, 869, 870, 874, 875, 932, 936, 949, 950, 1026, 1047, 1140, 1141, 1142, 1143, 1144, 1145, 1146, 1147, 1148, 1149, 1200, 1201, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1361, 10000, 10001, 10002, 10003, 10004, 10005, 10006, 10007, 10008, 10010, 10017, 10021, 10029, 10079, 10081, 10082, 12000, 12001, 20000, 20001, 20002, 20003, 20004, 20005, 20105, 20106, 20107, 20108, 20127, 20261, 20269, 20273, 20277, 20278, 20280, 20284, 20285, 20290, 20297, 20420, 20423, 20424, 20833, 20838, 20866, 20871, 20880, 20905, 20924, 20932, 20936, 20949, 21025, 21027, 21866, 28591, 28592, 28593, 28594, 28595, 28596, 28597, 28598, 28599, 28603, 28605, 29001, 38598, 50220, 50221, 50222, 50225, 50227, 50229, 50930, 50931, 50933, 50935, 50936, 50937, 50939, 51932, 51936, 51949, 51950, 52936, 54936, 57002, 57003, 57004, 57005, 57006, 57007, 57008, 57009, 57010, 57011, 65000, 65001]) if codepage_num in allowed_codepage_nums: return codepage_num # Note: If support for a specific codepage ever becomes an issue we can look at add support using the actual code-pages. 
# Conversion tables for codepages can be retrieved from here: https://www.unicode.org/Public/MAPPINGS/VENDORS/MICSFT/ raise ValueError(f"Unsupported unicode codepage number `{codepage_num}` found in the header")
def decode_hex_char(item, codec)
-
Decode a bytes object using a specified codec.
item (bytes): A bytes object. codec (str): The name of the codec to use to decode the bytes
Expand source code
def decode_hex_char(item, codec): """Decode a bytes object using a specified codec. item (bytes): A bytes object. codec (str): The name of the codec to use to decode the bytes """ if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction("decoding char {0} with font {1}".format(item, codec)) if codec is None: # Default to U.S. Windows default codepage codec = 'CP1252' decoded = item.decode(codec) decoded = decoded.encode() if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction("char {0} decoded into {1} using codec {2}".format(item, decoded, codec)) return decoded
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str = 'utf-16-le') ‑> bytes
-
Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent.
Args
high
:bytes
- the high-surrogate code point
low
:bytes
- the low-surrogate code point
encoding
:str
- The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal.
Expand source code
def decode_surrogate_pair(high: bytes, low: bytes, encoding: str ='utf-16-le') -> bytes: """ Convert a pair of surrogate chars into the corresponding utf-16 encoded text string they should represent. Args: high (bytes): the high-surrogate code point low (bytes): the low-surrogate code point encoding (str): The encoding to apply to the final value. Defaults to 'utf-16-le' because: Microsoft uses UTF-16, little endian byte order. ( https://learn.microsoft.com/en-us/windows/win32/intl/using-byte-order-marks ) The Msg format is a Microsoft standard. Therefore, man is mortal. """ # Equation for turning surrogate pairs into a unicode scalar value which be used with utl-16 can ONLY found in Unicode 3.0.0 standard. # Unicode scalar value means the same thing as "code position" or "code point" # https://www.unicode.org/versions/Unicode3.0.0/ # section 3.7 https://www.unicode.org/versions/Unicode3.0.0/ch03.pdf#page=9 if high.startswith(b"\\u"): high = high[2:] if low.startswith(b"\\u"): low = low[2:] if is_surrogate_16bit(high, "high"): char_high = chr(65536+int(high)) else: char_high = chr(int(high)) if is_surrogate_16bit(low, "low"): char_low = chr(65536+int(low)) else: char_low = chr(int(low)) unicode_scalar_value = ((ord(char_high) - 0xD800) * 0x400) + (ord(char_low) - 0xDC00) + 0x10000 unicode_bytes = chr(unicode_scalar_value).encode(encoding) return unicode_bytes.decode(encoding).encode()
def get_bytes_from_hex_encoded(item)
-
Convert hex encoded string to bytes.
item (bytes): a hex encoded string in the format \'XX (an RTF hex escape)
Expand source code
def get_bytes_from_hex_encoded(item): """Convert hex encoded string to bytes. item (str): a hex encoded string in format \\'XX """ hexstring = item.replace(b"\\'", b"") hex_bytes = bytes.fromhex(hexstring.decode()) return hex_bytes
def get_codepage_num_from_fcharset(fcharsetN: int) ‑> Optional[int]
-
Return the codepage to use with a specific fcharsetN.
Args
fcharsetN
:int
- The numeric argument N for a charsetN control word.
Returns
(int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.)
Expand source code
def get_codepage_num_from_fcharset(fcharsetN: int) -> Union[int,None]: """Return the codepage to use with a specific fcharsetN. Args: fcharsetN (int): The numeric argument N for a \fcharsetN control word. Returns: (int OR None) Returns the int for a codepage if known. Returns None for unknown charsets or charsets with no corresponding codepage (such as OEM or DEFAULT.) """ # Charset table retrieved on 2022-08-19 # https://web.archive.org/web/20220819215334/https://docs.microsoft.com/en-us/previous-versions/cc194829%28v=msdn.10%29?redirectedfrom=MSDN charsets: dict[int,dict[str,Any]] = { 0:{"name":"ANSI_CHARSET","hex":"0x00","decimal":0,"id":1252}, 1:{"name":"DEFAULT_CHARSET","hex":"0x01","decimal":1,"id":None}, 2:{"name":"SYMBOL_CHARSET","hex":"0x02","decimal":2,"id":None}, 128:{"name":"SHIFTJIS_CHARSET","hex":"0x80","decimal":128,"id":932}, 129:{"name":"HANGUL_CHARSET","hex":"0x81","decimal":129,"id":949}, 134:{"name":"GB2312_CHARSET","hex":"0x86","decimal":134,"id":936}, 136:{"name":"CHINESEBIG5_CHARSET","hex":"0x88","decimal":136,"id":950}, 161:{"name":"GREEK_CHARSET","hex":"0xA1","decimal":161,"id":1253}, 162:{"name":"TURKISH_CHARSET","hex":"0xA2","decimal":162,"id":1254}, 177:{"name":"HEBREW_CHARSET","hex":"0xB1","decimal":177,"id":1255}, 178:{"name":"ARABIC_CHARSET","hex":"0xB2","decimal":178,"id":1256}, 186:{"name":"BALTIC_CHARSET","hex":"0xBA","decimal":186,"id":1257}, 204:{"name":"RUSSIAN_CHARSET","hex":"0xCC","decimal":204,"id":1251}, 222:{"name":"THAI_CHARSET","hex":"0xDE","decimal":222,"id":874}, 238:{"name":"EE_CHARSET","hex":"0xEE","decimal":238,"id":1250}, 255:{"name":"OEM_CHARSET","hex":"0xFF","decimal":255,"id":None}, } if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction(f"Getting charset for {fcharsetN}") charset = charsets.get(fcharsetN, None) if charset is not None: charset_id = charset.get('id', None) return charset_id return None
def get_default_font(tree: lark.tree.Tree) ‑> Optional[str]
-
Extract the font number controlword default font if it exists.
If an RTF file uses a default font, the default font number is specified with the \deffN control word, which must precede the font-table group.
Args
tree
:Tree
- A lark Tree object. Should be the DeEncapsulator.full_tree object.
Returns
The default font control number if it exists from the first
\deffN
. None if not found.Expand source code
def get_default_font(tree: Tree) -> Union[str,None]: """Extract the font number controlword default font if it exists. If an RTF file uses a default font, the default font number is specified with the \\deffN control word, which must precede the font-table group. Args: tree (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree object. Returns: The default font control number if it exists from the first `\\deffN`. None if not found. """ deff_gen = tree.scan_values( lambda v: is_codeword_with_numeric_arg(v, b'\\deff') ) deff_options = list(deff_gen) try: # We just want the first \\deffN. It shouldn't be set multiple times. deff = deff_options[0] deff_num = deff.value[5:] return b'\\f' + deff_num except IndexError: return None
def get_font_table(tree: lark.tree.Tree) ‑> lark.tree.Tree
-
Extract the font table group from the first 20 tokens of a .rtf document.
Args
tree
:Tree
- A .rtf document object parsed into a Tree object
Raises
ValueError
- If no group with a
\fonttbl
token as its first controlword is found.
Returns
{'\f0': fontdef(fnum='\f0', codepage=932, codec='cp932', fontdef_tree='{\f0\fswiss\fcharset128 MS PGothic;}'), '\f1': fontdef(fnum='\f1', codepage=None, codec=None, fontdef_tree='{\f1\fmodern MS Gothic;}'), '\f2': fontdef(fnum='\f2', codepage=None, codec=None, fontdef_tree='{\f2\fnil\fcharset2 Symbol;}'), '\f3': fontdef(fnum='\f3', codepage=1252, codec='cp1252', fontdef_tree='{\f3\fmodern\fcharset0 Courier New;}'), '\f4': fontdef(fnum='\f4', codepage=932, codec='cp932', fontdef_tree='{\f4\fswiss\fcharset128 "PMingLiU";}'), '\f5': fontdef(fnum='\f5', codepage=None, codec=None, fontdef_tree='{\f5\fswiss "Amnesty Trade Gothic";}'), '\f6': fontdef(fnum='\f6', codepage=None, codec=None, fontdef_tree='{\f6\fswiss "Arial";}')}
Expand source code
def get_font_table(tree: Tree) -> Tree: """Extract the font table group from the first 20 tokens of a .rtf document. Args: tree (Tree): A .rtf document object parsed into a Tree object Raises: ValueError: If no group with a `\\fonttbl` token as its first controlword is found. Returns: {'\\f0': fontdef(fnum='\\f0', codepage=932, codec='cp932', fontdef_tree='{\\f0\\fswiss\\fcharset128 MS PGothic;}'), '\\f1': fontdef(fnum='\\f1', codepage=None, codec=None, fontdef_tree='{\\f1\\fmodern MS Gothic;}'), '\\f2': fontdef(fnum='\\f2', codepage=None, codec=None, fontdef_tree='{\\f2\\fnil\\fcharset2 Symbol;}'), '\\f3': fontdef(fnum='\\f3', codepage=1252, codec='cp1252', fontdef_tree='{\\f3\\fmodern\\fcharset0 Courier New;}'), '\\f4': fontdef(fnum='\\f4', codepage=932, codec='cp932', fontdef_tree='{\\f4\\fswiss\\fcharset128 "PMingLiU";}'), '\\f5': fontdef(fnum='\\f5', codepage=None, codec=None, fontdef_tree='{\\f5\\fswiss "Amnesty Trade Gothic";}'), '\\f6': fontdef(fnum='\\f6', codepage=None, codec=None, fontdef_tree='{\\f6\\fswiss "Arial";}')} """ for item in tree.children[:20]: if isinstance(item, Tree): try: ctrl_value = item.children[1] except IndexError as _e: continue if isinstance(ctrl_value, Token): table_type = ctrl_value.value.strip() if table_type == b"\\fonttbl": return item raise ValueError("No font table found in tree")
def get_python_codec(codepage_num: int) ‑> str
-
Returns the python codec needed to decode bytes to unicode.
Args
codepage_num
:int
- A codepage number.
Returns
The name of the codec in the Python codec registry. Used as the name for encoding/decoding.
Expand source code
def get_python_codec(codepage_num: int) -> str: """Returns the python codec needed to decode bytes to unicode. Args: codepage_num (int): A codepage number. Returns: The name of the codec in the Python codec registry. Used as the name for enacoding/decoding. """ text_codec = codepages.codepage2codec(codepage_num) log.debug('Found python codec corresponding to code page {0}: {1}'.format(codepage_num, text_codec)) return text_codec
def get_unicode_char_byte_count(item: lark.lexer.Token) ‑> int
-
Expand source code
def get_unicode_char_byte_count(item: Token) -> int: item = item.value.decode() cur_uc = int(item[3:]) return cur_uc
def has_hexarray(children: List[Union[lark.lexer.Token, lark.tree.Tree]]) ‑> bool
-
Checks if a tree's children include a hexarray tree.
children (array): the children object from a tree.
Expand source code
def has_hexarray(children: List[Union[Token, Tree]]) -> bool: """Checks if an tree's children includes a hexarray tree. children (array): the children object from a tree. """ for item in children: if is_hexarray(item): return True return False
def includes_unicode_chars(children: List[lark.lexer.Token]) ‑> bool
-
Does a list include Tokens which contain unicode characters. Not recursive.
Args
children
:list
- A Tree.children list to check to see if it includes unicode characters.
Returns
True if list includes tokens which contain unicode chars. False if not.
Expand source code
def includes_unicode_chars(children: List[Token]) -> bool: """Does a list include Tokens which contain unicode characters. Not recursive. Args: children (list): A Tree.children list to check to see if it includes unicode characters. Returns: True if list includes tokens which contain unicode chars. False if not. """ for child in children: if is_unicode_encoded(child): return True return False
def is_font_number(token: lark.lexer.Token) ‑> bool
-
Checks if an object is a "font number".
Returns
True if an object is a "font number" controlword
\fN
. False if not.Expand source code
def is_font_number(token: Token) -> bool: """Checks if an object is a "font number". Returns: True if an object is a "font number" controlword `\\fN`. False if not. """ try: if is_codeword_with_numeric_arg(token, b'\\f'): return True except AttributeError: # pragma: no cover return False return False
def is_hex_encoded(item: lark.lexer.Token) ‑> bool
-
Identify if a token contains a HEXENCODED token.
Args
item
:token
- A token to check if it is HEXENCODED.
Return
True if HEXENCODED. False if not.
Expand source code
def is_hex_encoded(item: Token) -> bool: """Identify if a token contains a HEXENCODED token. Args: item (token): A token to check if it is HEXENCODED. Return: True if HEXENCODED. False if not. """ if isinstance(item, Token): if item.type == "HEXENCODED": return True return False
def is_hexarray(item)
-
Checks if an item is a hexarray tree.
item (Tree or Token): an item to check to see if its a hex array
Expand source code
def is_hexarray(item): """Checks if an item is a hexarray tree. item (Tree or Token): an item to check to see if its a hex array """ if isinstance(item, Tree): if item.data.value == 'hexarray': return True return False
def is_surrogate_16bit(item: bytes, cp_range) ‑> bool
-
Checks if a unicode char is encoded as a 16-bit signed integer rather than the raw unicode code point. Callers should first check that it is a surrogate code point using the is_surrogate_XXXX_char functions.
Args
item
:bytes
- A bytes representation of a string representing a unicode character.
cp_range
:str
- ['low' OR 'high'] The code point range (low-surrogate or high-surrogate).
Expand source code
def is_surrogate_16bit(item: bytes, cp_range) -> bool: """Checks if a unicode char is 16 bit signed integer or the raw unicode char. This should first check if it is a surrogate code using the is_surrogate_XXXX_char functions. Args: item (bytes): A bytes representation of a string representing a unicode character. cp_range (str): ['low' OR 'high'] The code point range (low-surrogate or high-surrogate). """ if cp_range == 'low': if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF: return True elif cp_range == 'high': if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF: return True else: raise ValueError("cp_range must be either 'low' or 'high'") return False
def is_surrogate_high_char(item: bytes) ‑> bool
-
Checks if a char is in the high-surrogate code point range. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." The high-surrogate is also sometimes known as the leading surrogate.
item (bytes): A bytes representation of a string representing a unicode character. "\u-10179"
Expand source code
def is_surrogate_high_char(item: bytes) -> bool: """Check's if chr is a is in the high-surrogate code point rage. "High-surrogate code point: A Unicode code point in the range U+D800 to U+DBFF." High-surrogate also sometimes known as the leading surrogate. item (bytes): A bytes representation of a string representing a unicode character. "\\u-10179" """ if item.startswith(b"\\u"): item = item[2:] if 0xD800 <= ord(chr(65536+int(item))) <= 0xDBFF: return True # In case unicode is NOT using the 16 bit signed integer elif 0xD800 <= int(item) <= 0xDBFF: return True return False
def is_surrogate_low_char(item: bytes) ‑> bool
-
Checks if a char is in the low-surrogate code point range. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." Low-surrogates are also sometimes known as following surrogates.
item (bytes): A bytes representation of a string representing a unicode character.
Expand source code
def is_surrogate_low_char(item: bytes) -> bool: """Check's if chr is a is in the low-surrogate code point rage. "Low-surrogate code point: A Unicode code point in the range U+DC00 to U+DFFF." Low-surrogate also sometimes known as following surrogates. item (bytes): A bytes representation of a string representing a unicode character. """ if item.startswith(b"\\u"): item = item[2:] if 0xDC00 <= ord(chr(65536+int(item))) <= 0xDFFF: return True # In case unicode is NOT using the 16 bit signed integer elif 0xDC00 <= int(item) <= 0xDFFF: return True return False
def is_surrogate_pair(first: bytes, second: bytes) ‑> bool
-
Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order.
Args
first
:bytes
- A bytes representation of a string representing the high-order byte in a surrogate char.
second
:bytes
- A bytes representation of a string representing the low-order byte in a surrogate char.
Expand source code
def is_surrogate_pair(first: bytes, second: bytes) -> bool: """Check if a pair of unicode characters are a surrogate pair. Must be passed in the correct order. Args: first (bytes): A bytes representation of a string representing the high-order byte in a surrogate char. second (bytes): A bytes representation of a string representing the low-order byte in a surrogate char. """ if is_surrogate_high_char(first): if is_surrogate_low_char(second): return True else: log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(first, second)) return False
def is_unicode_char_byte_count(item: lark.lexer.Token) ‑> bool
-
Expand source code
def is_unicode_char_byte_count(item: Token) -> bool: if isinstance(item, Token): if item.type == "CONTROLWORD": if item.value.startswith(b'\\uc'): return True return False
def is_unicode_encoded(item: lark.lexer.Token) ‑> bool
-
Checks whether a token contains a unicode char.
Args
item
:token
- A token to check if contains a unicode char.
Return
True if token contains a unicode char. False if not.
Expand source code
def is_unicode_encoded(item: Token) -> bool: """Is token contain a unicode char. Args: item (token): A token to check if contains a unicode char. Return: True if token contains a unicode char. False if not. """ if isinstance(item, Token): if item.type == "UNICODE": return True return False
def is_valid_ANSI_representation_char(item: lark.lexer.Token) ‑> bool
-
Checks whether a token contains a valid ANSI representation string for a Unicode char.
Args
item
:token
- A token to check if it is a valid ANSI representation.
Return
True if token is an ansi representation of a unicode char. False if not.
Expand source code
def is_valid_ANSI_representation_char(item: Token) -> bool: """Is token contain a valid ANSI representation string for a Unicode char. Args: item (token): A token to check if it is a valid ANSI representation. Return: True if token is an ansi representation of a unicode char. False if not. """ if isinstance(item, Token): # print(f"found TOKEN posssible ansi {repr(item)}") if is_hex_encoded(item): # print(f"found hex posssible ansi {repr(item)}") return True if item.type == 'STRING': # print(f"found STRING posssible ansi {repr(item)}") if not item.value.isspace(): # whitespace doesn't count. # print(f"found posssible ansi {repr(item)}") return True # else: # print(f"found SPACE posssible ansi {repr(item)}") # print(f"found NON TOKEN posssible ansi {repr(item)}") return False
def merge_surrogate_chars(children, ascii_map, use_ASCII_alternatives_on_unicode_decode_failure=False)
-
Raises
ValueError
- A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
Expand source code
def merge_surrogate_chars(children,
                          ascii_map,
                          use_ASCII_alternatives_on_unicode_decode_failure = False):
    """Merge adjacent high/low surrogate unicode tokens into single STRING tokens.

    Walks a Tree.children list looking for a UNICODE token holding a
    high-surrogate code point followed by one holding a low-surrogate code
    point, decodes each pair via decode_surrogate_pair, and replaces the pair
    in-place (the high slot gets the decoded STRING token, the low slot gets
    an empty STRING token so indices stay stable).

    Args:
        children: A Tree.children list to merge surrogate pairs within.
        ascii_map: Map of unicode tokens to their removed ASCII-alternative
            tokens (as produced by remove_unicode_replacements); used as a
            fallback when decoding fails.
        use_ASCII_alternatives_on_unicode_decode_failure (bool): On decode
            failure, substitute the ASCII alternatives instead of raising.

    Raises:
        ValueError: A Standalone high-surrogate was found. High surrogate followed by a illegal low-surrogate character.
    """
    surrogate_start = None  # index of the pending high-surrogate token
    surrogate_high = None   # the pending high-surrogate token itself
    for i,c in enumerate(children):
        if isinstance(c, Tree):
            # Don't recurse into sub-trees; only merge at this level.
            continue
        if is_unicode_encoded(c):
            if is_surrogate_high_char(c.value):
                surrogate_start = i
                surrogate_high = c
            elif surrogate_start is not None:
                if is_surrogate_low_char(c.value):
                    surrogate_low = c
                    try:
                        surrogate_value = decode_surrogate_pair(surrogate_high.value,
                                                                surrogate_low.value)
                        # Convert into STRING token spanning both originals.
                        surrogate_tok = Token('STRING',
                                              surrogate_value,
                                              start_pos=surrogate_high.start_pos,
                                              end_pos=surrogate_low.end_pos,
                                              line=surrogate_high.line,
                                              end_line=surrogate_low.end_line,
                                              column=surrogate_high.column,
                                              end_column=surrogate_low.end_column)
                        children[surrogate_start] = surrogate_tok
                        # Blank out the low-surrogate slot so list indices stay valid.
                        blank_tok = Token('STRING',
                                          b"",
                                          start_pos=surrogate_high.start_pos+1,
                                          end_pos=surrogate_low.end_pos+1,
                                          line=surrogate_high.line,
                                          end_line=surrogate_low.end_line,
                                          column=surrogate_high.column,
                                          end_column=surrogate_low.end_column)
                        children[i] = blank_tok
                        surrogate_start = None
                        surrogate_high = None
                    except UnicodeDecodeError as _e:
                        if use_ASCII_alternatives_on_unicode_decode_failure is True:
                            # Fall back to the ASCII alternatives captured earlier.
                            children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                            children[i] = b"".join([i.value for i in ascii_map[surrogate_low]])
                        else:
                            raise _e
                else:
                    # High surrogate followed by a non-low-surrogate unicode token.
                    # NOTE(review): `surrogate_low` may be unbound here if no prior
                    # pair was merged in this call — the format() below would then
                    # raise NameError instead of logging. Confirm upstream.
                    log.info("RTFDE encountered a standalone high-surrogate point without a corresponding low-surrogate. Standalone surrogate code points have either a high surrogate without an adjacent low surrogate, or vice versa. These code points are invalid and are not supported. Their behavior is undefined. Codepoints encountered: {0}, {1}".format(surrogate_high, surrogate_low))
                    if use_ASCII_alternatives_on_unicode_decode_failure is True:
                        children[surrogate_start] = b"".join([i.value for i in ascii_map[surrogate_high]])
                    else:
                        raise ValueError("Standalone high-surrogate found. High surrogate followed by a illegal low-surrogate character.")
    return children
def parse_font_tree(font_tree: lark.tree.Tree) ‑> dict
-
Create a font tree dictionary with the appropriate codecs to decode text.
Args
font_tree
:Tree
- The .rtf font table object decoded as a tree.
Returns
A dictionary which maps font numbers to the appropriate python codecs needed to decode text.
Expand source code
def parse_font_tree(font_tree: Tree) -> dict: """Create a font tree dictionary with appropriate codeces to decode text. Args: font_tree (Tree): The .rtf font table object decoded as a tree. Returns: A dictionary which maps font numbers to appropriate python codeces needed to decode text. """ parsed_font_tree = {} for tree in font_tree.children: if isinstance(tree, Tree): fnum = None fcharset = None cpg_num = None for tok in tree.children: if is_codeword_with_numeric_arg(tok, b'\\f'): fnum = tok.value elif is_codeword_with_numeric_arg(tok, b'\\fcharset'): fchar_num = int(tok.value[9:]) fcharset = get_codepage_num_from_fcharset(fchar_num) elif is_codeword_with_numeric_arg(tok, b'\\cpg'): cpg_num = int(tok.value[4:]) if fnum is not None: # get the codepage codepage_num = None if fcharset is not None: try: codepage_num = check_codepage_num(fcharset) except ValueError: # pragma: no cover codepage_num = None # if both \\fcharset and \\cpg appear in the font table, \\cpg is ignored. if ((codepage_num is None) and (cpg_num is not None)): try: codepage_num = check_codepage_num(cpg_num) except ValueError: # pragma: no cover codepage_num = None # Get the appropriate codec if codepage_num is not None: codec = get_python_codec(codepage_num) else: codec = None # Only add if there is a font definition tree_str = b"".join(list(flatten_tree_to_string_array(tree))) parsed_font_tree[fnum] = fontdef(fnum, codepage_num, codec, tree_str) return parsed_font_tree
def remove_unicode_replacements(children: List[lark.lexer.Token], return_ascii_map: bool = True, byte_count: int = 1) ‑> Union[Tuple[List[lark.lexer.Token], Dict[lark.lexer.Token, List[lark.lexer.Token]]], List[lark.lexer.Token]]
-
Remove all unicode replacement characters from a list of Tokens.
Args
children
:list
- A Tree.children list to remove unicode replacement characters from.
return_ascii_map
:bool
- On True, have this function return a map of the ASCII token that were removed.
byte_count
:int
- The number of bytes corresponding to a given \uN Unicode character. A default of 1 should be assumed if no \uc keyword has been seen in the current or outer scopes.
Returns
new_children (list): The list of Tokens with all unicode replacement characters removed. ascii_map (dict): All the Tokens which were removed from the provided children list keyed by
Expand source code
def remove_unicode_replacements(children: List[Token],
                                return_ascii_map: bool = True,
                                byte_count: int = 1) -> Union[
                                    Tuple[List[Token], Dict[Token,List[Token]]],
                                    List[Token]]:
    """Remove all unicode replacement characters from a list of Tokens.

    Args:
        children (list): A Tree.children list to remove unicode replacement characters from.
        return_ascii_map (bool): On True, have this function return a map of the ASCII token that were removed.
        byte_count (int): The number of bytes corresponding to a given \\uN Unicode character.  A default of 1 should be assumed if no \\uc keyword has been seen in the current or outer scopes.

    Returns:
        new_children (list): The list of Tokens with all unicode replacement characters removed.
        ascii_map (dict): All the Tokens which were removed from the provided children list, keyed by the unicode Token they were a replacement for.
    """
    # NOTE(review): this assignment discards the byte_count argument passed by
    # the caller (e.g. TextDecoder passes self.ucbc, which may be None). It may
    # be a deliberate guard against None, or a bug — confirm before changing.
    byte_count = 1
    ascii_map: Dict[Token,List[Token]] = {}
    new_children = []
    # Pending unicode tokens whose ASCII replacements still need removing;
    # one entry per replacement byte expected.
    removal_map: List[Token] = []
    if is_logger_on("RTFDE.text_extraction") is True:
        log_text_extraction(f"Removing unicode replacements on {repr(children)}")
    for child in children:
        if len(removal_map) > 0:
            if isinstance(child, Token):
                # Delete all spaces between a unicode char and the last ANSI representation
                if child.value.isspace():
                    ascii_map.setdefault(removal_map[0], []).append(child)
                    continue
                if is_valid_ANSI_representation_char(child):
                    # Found an ansi representation. Remove unicode char from removal map.
                    ascii_map.setdefault(removal_map.pop(), []).append(child)
                    continue
            elif isinstance(child, Tree) and ( (child.data == "string") or
                                               (child.data == "hexarray")):
                # ANSI representations may be nested inside string/hexarray trees.
                ansi_children = child.children
                new_ansi_children = []
                for aci,ac in enumerate(ansi_children):
                    if is_valid_ANSI_representation_char(ac):
                        if len(removal_map) > 0:
                            # Consume this representation against a pending unicode token.
                            ascii_map.setdefault(removal_map.pop(), []).append(ac)
                        else:
                            new_ansi_children.append(ac)
                    else:
                        new_ansi_children.append(ac)
                if new_ansi_children == []:
                    # Every child was consumed: replace the tree with an empty STRING token.
                    from RTFDE.utils import make_token_replacement
                    child = make_token_replacement("STRING", b"", child)
                else:
                    child.children = new_ansi_children
        # Modify char byte count if we encounter it.
        if is_unicode_char_byte_count(child):
            byte_count = get_unicode_char_byte_count(child)
        if is_unicode_encoded(child):
            for j in range(byte_count):
                # Add the unicode key to the removal map once per byte
                # This ensures we remove the right number of ANSI representation chars
                removal_map.append(child)
        new_children.append(child)
    if return_ascii_map is True:
        return new_children, ascii_map
    return new_children
def unicode_escape_to_chr(item: bytes) ‑> str
-
Convert unicode char from it's decimal to its unicode character representation. From "\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents.
Args
item
:str
- A RTF Escape in the format \u[-]NNNNN.
Returns
The unicode character representation of the identified character
Raises
ValueError
- The escaped unicode character is not valid.
Expand source code
def unicode_escape_to_chr(item: bytes) -> str: """Convert unicode char from it's decimal to its unicode character representation. From "\\u[-]NNNNN" to the string representing the character whose Unicode code point that decimal represents. Args: item (str): A RTF Escape in the format \\u[-]NNNNN. Returns: The unicode character representation of the identified character Raises: ValueError: The escaped unicode character is not valid. """ try: prefix = b'\\u' if item.startswith(prefix): nnnn = item[len(prefix):] else: nnnn = item nnnn = int(nnnn) # raises ValueError if not int. except ValueError as _e: raise ValueError(f"`{item}` is not a valid escaped unicode character.") from _e if nnnn < 0: # § -NNNNN is a negative integer expressed in decimal digits ncr = 65536 + nnnn else: # § NNNNN is a positive integer expressed in decimal digits ncr = nnnn # § HHHH is the hexadecimal equivalent of NNNNN or -NNNNN return chr(ncr)
def validate_ansi_cpg(header: str) ‑> None
-
Check an '\ansicpgNNNN' string to see if the number NNNN is an actual codepage.
Args
header
:str
- The value from the lark
\ansicpg
CONTROLWORD Token.
Raises
MalformedRtf
- If the value passed is not a valid ansi codepage.
Expand source code
def validate_ansi_cpg(header: str) -> None: """Check an '\\ansicpgNNNN' string to see if the number NNNN is an actual codepage. Args: header (str): The value from the lark `\\ansicpg` CONTROLWORD Token. Raises: MalformedRtf: If the value passed is not a valid ansi codepage. """ try: possible_cpg_num = int(header.strip()[8:]) check_codepage_num(possible_cpg_num) except ValueError as _e: raise MalformedRtf(f"Unsupported unicode codepage number `{header}` found in the header") from _e
Classes
class TextDecoder (keep_fontdef=False, initial_byte_count=None, use_ASCII_alternatives_on_unicode_decode_failure=False)
-
keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed. initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information. use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
Expand source code
class TextDecoder:
    """Decodes encoded text (unicode escapes, hex escapes, font codepages) in a parsed RTF tree in place."""

    def __init__(self, keep_fontdef=False,
                 initial_byte_count=None,
                 use_ASCII_alternatives_on_unicode_decode_failure=False):
        """
        keep_fontdef: (bool) If False (default), will remove fontdef's from object tree once they are processed.
        initial_byte_count: (int) The initial Unicode Character Byte Count. Does not need to be set unless you are only providing a RTF snippet which does not contain the RTF header which sets the information.
        use_ASCII_alternatives_on_unicode_decode_failure: (bool) If we encounter errors when decoding unicode chars we will use the ASCII alternative since that's what they are included for.
        """
        self.keep_fontdef = keep_fontdef
        self.ucbc = initial_byte_count
        self.use_ASCII_alternatives_on_unicode_decode_failure = use_ASCII_alternatives_on_unicode_decode_failure

        # Font table values are populated by set_font_info.
        self.default_font = None
        self.font_stack = []
        self.font_table = {}

    def set_font_info(self, obj: Tree):
        """Extract the default font and font table from a parsed tree and store them.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        self.default_font = get_default_font(obj)
        self.font_stack = [self.default_font]
        raw_fonttbl = get_font_table(obj.children[1])
        self.font_table = parse_font_tree(raw_fonttbl)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")

    def update_children(self, obj: Tree):
        """Decode all encoded text under a tree, replacing its children in place.

        obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
        """
        # Reset font info
        self.set_font_info(obj)
        children = obj.children
        obj.children = [i for i in self.iterate_on_children(children)]

    def prep_unicode(self, children: List[Token]):
        # Prepare unicode escapes: strip their ASCII fallbacks and merge
        # surrogate pairs, so iteration only sees decodable unicode tokens.
        if includes_unicode_chars(children):
            # Clean out all replacement chars
            children, ascii_map = remove_unicode_replacements(children,
                                                             byte_count=self.ucbc)
            # Merge all surrogate pairs
            children = merge_surrogate_chars(children,
                                             ascii_map,
                                             self.use_ASCII_alternatives_on_unicode_decode_failure)
        return children

    def iterate_on_children(self, children):
        """Yield decoded replacements for each child, recursing into sub-trees."""
        # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly.
        set_fonts = []  # fonts pushed while inside this group; popped on exit
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("Starting to iterate on text extraction children...")
            log_text_extraction("PREP-BEFORE: "+repr(children))
        children = self.prep_unicode(children)
        if is_logger_on("RTFDE.text_extraction") is True:
            log_text_extraction("PREP-AFTER: "+repr(children))
        for item in children:
            if is_font_number(item): # Font Definitions
                self.font_stack.append(item.value.strip())
                set_fonts.append(item.value)
                if self.keep_fontdef is True:
                    yield item
            elif is_unicode_char_byte_count(item):
                # NOTE(review): the extracted byte count is bound to an unused
                # local (`bc`) and never applied — presumably it should update
                # self.ucbc. Confirm before changing.
                bc = get_unicode_char_byte_count(item)
            elif is_unicode_encoded(item): # Unicode Chars
                decoded = unicode_escape_to_chr(item.value).encode()
                # Convert into STRING token
                decoded_tok = Token('STRING',
                                    decoded,
                                    start_pos=item.start_pos,
                                    end_pos=item.end_pos,
                                    line=item.line,
                                    end_line=item.end_line,
                                    column=item.column,
                                    end_column=item.end_column)
                if is_logger_on("RTFDE.text_extraction") is True:
                    log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}")
                yield decoded_tok
            # Decode a hex array
            elif is_hexarray(item):
                # Concatenate all hex bytes first so multi-byte codepoints
                # decode correctly with the current font's codec.
                base_bytes = None
                for hexchild in item.children:
                    if base_bytes is None:
                        base_bytes = get_bytes_from_hex_encoded(hexchild.value)
                    else:
                        base_bytes += get_bytes_from_hex_encoded(hexchild.value)
                current_fontdef = self.font_table[self.font_stack[-1]]
                current_codec = current_fontdef.codec
                decoded_hex = decode_hex_char(base_bytes, current_codec)
                # We are replacing a Tree. So, need item.data to access it's info token
                decoded_hex_tok = Token('STRING',
                                        decoded_hex,
                                        start_pos=item.data.start_pos,
                                        end_pos=item.data.end_pos,
                                        line=item.data.line,
                                        end_line=item.data.end_line,
                                        column=item.data.column,
                                        end_column=item.data.end_column)
                yield decoded_hex_tok
            elif isinstance(item, Tree):
                # Run this same function recursively on nested trees
                item.children = [i for i in self.iterate_on_children(item.children)]
                yield item
            else:
                yield item
        for i in set_fonts:
            # Remove all fonts defined while in this group
            self.font_stack.pop()
Methods
def iterate_on_children(self, children)
-
Expand source code
def iterate_on_children(self, children): # Children should be 'List[Union[Token,Tree]]' but lark's Tree typing is defined badly. set_fonts = [] if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction("Starting to iterate on text extraction children...") log_text_extraction("PREP-BEFORE: "+repr(children)) children = self.prep_unicode(children) if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction("PREP-AFTER: "+repr(children)) for item in children: if is_font_number(item): # Font Definitions self.font_stack.append(item.value.strip()) set_fonts.append(item.value) if self.keep_fontdef is True: yield item elif is_unicode_char_byte_count(item): bc = get_unicode_char_byte_count(item) elif is_unicode_encoded(item): # Unicode Chars decoded = unicode_escape_to_chr(item.value).encode() # Convert into STRING token decoded_tok = Token('STRING', decoded, start_pos=item.start_pos, end_pos=item.end_pos, line=item.line, end_line=item.end_line, column=item.column, end_column=item.end_column) if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction(f"UNICODE TOKEN {item}: {decoded_tok}") yield decoded_tok # Decode a hex array elif is_hexarray(item): # print("IS Hex?? {0}".format(item)) base_bytes = None for hexchild in item.children: if base_bytes is None: base_bytes = get_bytes_from_hex_encoded(hexchild.value) else: base_bytes += get_bytes_from_hex_encoded(hexchild.value) current_fontdef = self.font_table[self.font_stack[-1]] current_codec = current_fontdef.codec decoded_hex = decode_hex_char(base_bytes, current_codec) # We are replacing a Tree. 
So, need item.data to access it's info token decoded_hex_tok = Token('STRING', decoded_hex, start_pos=item.data.start_pos, end_pos=item.data.end_pos, line=item.data.line, end_line=item.data.end_line, column=item.data.column, end_column=item.data.end_column) yield decoded_hex_tok elif isinstance(item, Tree): # Run this same function recursively on nested trees item.children = [i for i in self.iterate_on_children(item.children)] yield item else: yield item for i in set_fonts: # Remove all fonts defined while in this group self.font_stack.pop()
def prep_unicode(self, children: List[lark.lexer.Token])
-
Expand source code
def prep_unicode(self, children: List[Token]): if includes_unicode_chars(children): # Clean out all replacement chars # log_text_extraction("Prepping Unicode Chars:" + repr(children)) children, ascii_map = remove_unicode_replacements(children, byte_count=self.ucbc) # print("===\nCHILD:" + repr(children)) # print("===\nASCII:" + repr(ascii_map)) # Merge all surrogate pairs children = merge_surrogate_chars(children, ascii_map, self.use_ASCII_alternatives_on_unicode_decode_failure) # print("FINAL CHILDREN") # log_text_extraction("Replaced Unicode Chars With: " + repr(children)) return children
def set_font_info(self, obj: lark.tree.Tree)
-
obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
Expand source code
def set_font_info(self, obj: Tree): """ obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree. """ self.default_font = get_default_font(obj) self.font_stack = [self.default_font] raw_fonttbl = get_font_table(obj.children[1]) self.font_table = parse_font_tree(raw_fonttbl) if is_logger_on("RTFDE.text_extraction") is True: log_text_extraction(f"FONT TABLE FOUND: {raw_fonttbl}")
def update_children(self, obj: lark.tree.Tree)
-
obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree.
Expand source code
def update_children(self, obj: Tree): """ obj (Tree): A lark Tree object. Should be the DeEncapsulator.full_tree. """ # Reset font info self.set_font_info(obj) children = obj.children obj.children = [i for i in self.iterate_on_children(children)]
class fontdef (fnum, codepage, codec, fontdef_tree)
-
fontdef(fnum, codepage, codec, fontdef_tree)
Ancestors
- builtins.tuple
Instance variables
var codec
-
Alias for field number 2
var codepage
-
Alias for field number 1
var fnum
-
Alias for field number 0
var fontdef_tree
-
Alias for field number 3