Module RTFDE.utils

Expand source code
# -*- coding: utf-8 -*-
#
# This file is part of package name, a package description short.
# Copyright © 2022 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.

import difflib
import sys
import re
from typing import Union, AnyStr, Any
#  from Python 3.9 typing.Generator is deprecated in favour of collections.abc.Generator
from collections.abc import Generator
from lark.lexer import Token
from lark.tree import Tree
from lark import Lark


import logging
log = logging.getLogger("RTFDE")

def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
    """Returns the hex encoded value of a .rtf control parameter.

Args:
    control_parameter: (int/str) Int, or a string (or bytes) which represents an int.

Returns:
    The value formatted with `#06x`: a "0x" prefix followed by the hexadecimal digits, zero padded to at least 6 characters in total (e.g. 255 -> "0x00ff").

Raises:
    ValueError: If a string/bytes value does not represent an int.
"""
    if not isinstance(control_parameter, int):
        # str and bytes cannot be hex formatted directly; convert them first.
        # (Formatting bytes raises TypeError rather than ValueError, so an
        # `except ValueError` fallback would never convert them.)
        control_parameter = int(control_parameter)
    return f"{control_parameter:#06x}"

def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
    """Writes an object to a dump file for quick debugging.

Warning: Not for normal use. Only use when debugging.

Args:
    data (bytes|str): Data to write to path. bytes/bytearray are written unchanged in binary mode; anything else is printed (its str() form plus a trailing newline) in text mode.
    path (str): The file path to write data to
    """
    if isinstance(data, (bytes, bytearray)):
        # print() only emits str, which a binary file handle rejects, so
        # binary data is written to the file directly.
        with open(path, 'wb+') as fp:
            fp.write(data)
    else:
        # Hand the file to print() instead of swapping sys.stdout, so stdout
        # can never be left pointing at a closed file if writing fails.
        with open(path, 'w+') as fp:
            print(data, file=fp)

def encode_escaped_control_chars(raw_text: bytes) -> bytes:
    """Swap escaped control characters for their RTF hex form (\\'HH).

Args:
    raw_text (bytes): RTF data which may contain the escapes \\\\, \\{ and \\}

Returns:
    The data with those escapes replaced by \\'5c, \\'7b and \\'7d
    """
    # Order matters: escaped backslashes go first, otherwise the second half
    # of an escaped backslash followed by a brace would be read as an
    # escaped brace.
    substitutions = (
        (b'\\\\', b"\\'5c"),
        (b'\\{', b"\\'7b"),
        (b'\\}', b"\\'7d"),
    )
    encoded = raw_text
    for escaped, hex_form in substitutions:
        encoded = encoded.replace(escaped, hex_form)
    return encoded

def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
    """Tell whether a Token holds `codeword` followed only by digits.

Args:
    token: A Token, or any other object. Objects lacking the attributes used here (e.g. no `.value`) are never a match.
    codeword (bytes): The control word to look for at the start of the token value.

Returns:
    True if the stripped token value starts with the codeword and everything after it is digits. False otherwise.
"""
    try:
        stripped = token.value.strip()
        if not stripped.startswith(codeword):
            return False
        argument = stripped[len(codeword):]
        return argument.isdigit()
    except AttributeError:
        # Not token-like: treat as "no match" instead of failing.
        return False

def print_lark_parser_evaluated_grammar(parser):
    """Prints the final evaluated grammar.

Can be useful for debugging possible errors in grammar evaluation.

Args:
    parser (Lark obj): Lark object to extract grammar from.

Raises:
    ValueError: If parser is not a Lark object.
    """
    if not isinstance(parser, Lark):
        raise ValueError("Requires a Lark object.")
    banner_edge = " " + "=" * 15 + " "
    sections = (("RULES", "rules"),
                ("TERMINALS", "terminals"),
                ("IGNORED TOKENS", "ignore_tokens"))
    for title, attribute in sections:
        print(banner_edge + title + banner_edge + "\n")
        for entry in getattr(parser, attribute):
            # Format instead of concatenating: entries need not be plain
            # strings (presumably lark rule/terminal definition objects --
            # confirm against the lark version in use), and `str + object`
            # raises TypeError.
            print(f"    {entry}")

def log_validators(data):
    """Emit `data` on the RTFDE.validation_logger logger at debug level.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    validation_log = logging.getLogger("RTFDE.validation_logger")
    if validation_log.level != logging.DEBUG:
        return
    validation_log.debug(data)

def log_transformations(data):
    """Emit `data` on the RTFDE.transform_logger logger at debug level.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    transform_log = logging.getLogger("RTFDE.transform_logger")
    if transform_log.level != logging.DEBUG:
        return
    transform_log.debug(data)

def is_logger_on(logger_name, level=logging.DEBUG):
    """Report whether the named logger is set to exactly `level`.

Args:
    logger_name: Name of the logger to look up.
    level: Level to compare against. Defaults to logging.DEBUG.

Returns:
    True if the logger's own level equals `level` (a level inherited from a
    parent logger does not count), otherwise False.
    """
    own_level = logging.getLogger(logger_name).level
    return own_level == level

def log_text_extraction(data):
    """Emit text decoding/encoding details on the RTFDE.text_extraction logger.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    extraction_log = logging.getLogger("RTFDE.text_extraction")
    if extraction_log.level != logging.DEBUG:
        return
    extraction_log.debug(data)

def log_htmlrtf_stripping(data: Token):
    """Log a removed HTMLRTF Token on the RTFDE.HTMLRTF_Stripping_logger logger.

The message carries the token's value, line, end_line, start_pos and end_pos.
Nothing happens (and `data` is not checked) unless that logger's own level is exactly DEBUG.

Args:
    data (lark Token): The token that was stripped.

Raises:
    AttributeError: Will occur if you pass this something that is not a token.
"""
    stripping_log = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
    if stripping_log.level != logging.DEBUG:
        return
    if not isinstance(data, Token):
        raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
    details = {"value": data.value,
               "line": data.line,
               "end_line": data.end_line,
               "start_pos": data.start_pos,
               "end_pos": data.end_pos}
    template = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
    stripping_log.debug(template.format(**details))

def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Write the diff of two byte strings to the main RTFDE logger as a debug log.

The diff is produced by get_string_diff; by default the strings are split by newlines, keeping the line ends.

Warning: Only use when debugging as this is too verbose to be used in regular logging.

Args:
    original: The original string
    revised: The changed version of the string
    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
"""
    diff_text = get_string_diff(original, revised, sep)
    log.debug(diff_text)

def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

Args:
    original: The original string (bytes)
    revised: The changed version of the string (bytes)
    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Newlines are removed before splitting, and all empty strings are deleted before diffing to make the diff more concise.

Returns:
    A string object representing the diff (difflib context diff) of the two strings provided. Empty when there are no differences.
"""
    def _as_text(chunk: bytes) -> str:
        # Bytes that are not valid UTF-8 are shown as \xNN escapes instead of
        # aborting the diff with a UnicodeDecodeError.
        return chunk.decode(errors="backslashreplace")

    if sep is None:
        orig_split = _as_text(original).splitlines(keepends=True)
        revised_split = _as_text(revised).splitlines(keepends=True)
    else:
        original = original.replace(b'\n', b'')
        revised = revised.replace(b'\n', b'')
        # `if piece` drops empty strings and also the None entries re.split
        # produces for capturing groups that did not take part in a match.
        orig_split = [_as_text(piece) for piece in re.split(sep, original) if piece]
        revised_split = [_as_text(piece) for piece in re.split(sep, revised) if piece]
    return "\n".join(difflib.context_diff(orig_split, revised_split))

def get_tree_diff(original: Tree, revised: Tree):
    """Get the diff of two trees.

Both trees are flattened with flatten_tree and the resulting lists are compared with a difflib context diff.

Args:
    original (lark Tree): A lark tree before transformation
    revised (lark Tree): A lark tree after transformation

Returns:
    A string object representing the diff of the two Trees provided.

Example:
    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()
    transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
    get_tree_diff(rtf_obj.full_tree, transformed_tree)

    """
    flat_original = list(flatten_tree(original))
    flat_revised = list(flatten_tree(revised))
    return "\n".join(difflib.context_diff(flat_original, flat_revised))
def flatten_tree(tree: Tree) -> Generator:
    """Walk a lark Tree depth-first, yielding a string for every node.

Args:
    tree (lark Tree): A lark tree

Yields:
    "Tree('<data>')" for each tree (before its children) and the repr() of every child that is not a Tree.
"""
    yield f"Tree('{tree.data}')"
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree(node)
        else:
            # Tokens and any other leaf objects are rendered the same way.
            yield repr(node)

def flatten_tree_to_string_array(tree: Tree) -> Generator:
    """Walk a lark Tree depth-first, yielding the content of its leaves.

Args:
    tree (lark Tree): A lark tree

Yields:
    The `.value` of each Token and, unchanged, any child that is neither a Tree nor a Token. Nested Trees are descended into rather than yielded.
"""
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree_to_string_array(node)
            continue
        yield node.value if isinstance(node, Token) else node

def make_token_replacement(ttype, value, example):
    """Build a new Token that takes over the position of an existing node.

Args:
    ttype: The type of the new Token.
    value: The value of the new Token.
    example (lark Token|Tree): Node whose position information (start_pos, end_pos, line, end_line, column, end_column) is copied. A Token supplies its own attributes; a Tree supplies those of its `meta`.

Returns:
    A Token of type `ttype` holding `value`, carrying the position of `example`.

Raises:
    TypeError: If example is neither a Token nor a Tree.
"""
    if isinstance(example, Token):
        position = example
    elif isinstance(example, Tree):
        position = example.meta
    else:
        # Fail with a clear message; without this branch the function ended
        # in an UnboundLocalError at the return statement.
        raise TypeError("example must be a lark Token or Tree, "
                        f"not {type(example).__name__}")
    return Token(ttype,
                 value,
                 start_pos=position.start_pos,
                 end_pos=position.end_pos,
                 line=position.line,
                 end_line=position.end_line,
                 column=position.column,
                 end_column=position.end_column)


def embed():
    """Drop into an interactive console in the caller's scope (debugging aid).

Prints the call stack, then starts `code.interact` with a namespace built from the calling frame's globals and locals, with readline tab completion. History is read from ~/.python_history (if it exists) and written back when the console exits.

Warning: Not for normal use. Only use when debugging. Needs the `readline` module to be importable.
"""
    import os
    import readline
    import rlcompleter
    import code
    import inspect
    import traceback

    history = os.path.join(os.path.expanduser('~'), '.python_history')
    if os.path.isfile(history):
        readline.read_history_file(history)

    frame = inspect.currentframe().f_back
    # Start from the globals and overlay the locals so that, as in the calling
    # function itself, a local name shadows a global of the same name.
    namespace = frame.f_globals.copy()
    namespace.update(frame.f_locals)

    readline.set_completer(rlcompleter.Completer(namespace).complete)
    readline.parse_and_bind("tab: complete")

    file = frame.f_code.co_filename
    line = frame.f_lineno
    function = frame.f_code.co_name

    # Drop the last stack entry: it is this function's own frame.
    stack = ''.join(traceback.format_stack()[:-1])
    print(stack)
    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
    try:
        code.interact(banner=banner, local=namespace)
    finally:
        # Save the history even if the console exits through an exception.
        readline.write_history_file(history)

Functions

def embed()
Expand source code
def embed():
    """Drop into an interactive console in the caller's scope (debugging aid).

Prints the call stack, then starts `code.interact` with a namespace built from the calling frame's globals and locals, with readline tab completion. History is read from ~/.python_history (if it exists) and written back when the console exits.

Warning: Not for normal use. Only use when debugging. Needs the `readline` module to be importable.
"""
    import os
    import readline
    import rlcompleter
    import code
    import inspect
    import traceback

    history = os.path.join(os.path.expanduser('~'), '.python_history')
    if os.path.isfile(history):
        readline.read_history_file(history)

    frame = inspect.currentframe().f_back
    # Start from the globals and overlay the locals so that, as in the calling
    # function itself, a local name shadows a global of the same name.
    namespace = frame.f_globals.copy()
    namespace.update(frame.f_locals)

    readline.set_completer(rlcompleter.Completer(namespace).complete)
    readline.parse_and_bind("tab: complete")

    file = frame.f_code.co_filename
    line = frame.f_lineno
    function = frame.f_code.co_name

    # Drop the last stack entry: it is this function's own frame.
    stack = ''.join(traceback.format_stack()[:-1])
    print(stack)
    banner = f" [ {os.path.basename(file)}:{line} in {function}() ]"
    banner += "\n Entering interactive mode (Ctrl-D to exit) ..."
    try:
        code.interact(banner=banner, local=namespace)
    finally:
        # Save the history even if the console exits through an exception.
        readline.write_history_file(history)
def encode_escaped_control_chars(raw_text: bytes) ‑> bytes

Replaces escaped control characters within the text with their RTF hex-encoded versions (\'HH).

Args

raw_text : bytes
string which needs escape characters encoded

Returns

A string with escaped control chars

Expand source code
def encode_escaped_control_chars(raw_text: bytes) -> bytes:
    """Swap escaped control characters for their RTF hex form (\\'HH).

Args:
    raw_text (bytes): RTF data which may contain the escapes \\\\, \\{ and \\}

Returns:
    The data with those escapes replaced by \\'5c, \\'7b and \\'7d
    """
    # Order matters: escaped backslashes go first, otherwise the second half
    # of an escaped backslash followed by a brace would be read as an
    # escaped brace.
    substitutions = (
        (b'\\\\', b"\\'5c"),
        (b'\\{', b"\\'7b"),
        (b'\\}', b"\\'7d"),
    )
    encoded = raw_text
    for escaped, hex_form in substitutions:
        encoded = encoded.replace(escaped, hex_form)
    return encoded
def flatten_tree(tree: lark.tree.Tree) ‑> collections.abc.Generator

Flatten a lark Tree into a list of repr's of tree objects.

Args

tree : lark Tree
A lark tree
Expand source code
def flatten_tree(tree: Tree) -> Generator:
    """Walk a lark Tree depth-first, yielding a string for every node.

Args:
    tree (lark Tree): A lark tree

Yields:
    "Tree('<data>')" for each tree (before its children) and the repr() of every child that is not a Tree.
"""
    yield f"Tree('{tree.data}')"
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree(node)
        else:
            # Tokens and any other leaf objects are rendered the same way.
            yield repr(node)
def flatten_tree_to_string_array(tree: lark.tree.Tree) ‑> collections.abc.Generator

Flatten a lark Tree into a sequence of its leaf values (Token values and any other non-Tree children).

Args

tree : lark Tree
A lark tree
Expand source code
def flatten_tree_to_string_array(tree: Tree) -> Generator:
    """Walk a lark Tree depth-first, yielding the content of its leaves.

Args:
    tree (lark Tree): A lark tree

Yields:
    The `.value` of each Token and, unchanged, any child that is neither a Tree nor a Token. Nested Trees are descended into rather than yielded.
"""
    for node in tree.children:
        if isinstance(node, Tree):
            yield from flatten_tree_to_string_array(node)
            continue
        yield node.value if isinstance(node, Token) else node
def get_control_parameter_as_hex_strings(control_parameter: Union[str, int]) ‑> str

Returns the hex encoded value of a .rtf control parameter.

Args

control_parameter
(int/str) Int or a string which represents an int.

Returns

Zero padded 6 char long hexadecimal string.

Expand source code
def get_control_parameter_as_hex_strings(control_parameter: Union[str,int]) -> str:
    """Returns the hex encoded value of a .rtf control parameter.

Args:
    control_parameter: (int/str) Int, or a string (or bytes) which represents an int.

Returns:
    The value formatted with `#06x`: a "0x" prefix followed by the hexadecimal digits, zero padded to at least 6 characters in total (e.g. 255 -> "0x00ff").

Raises:
    ValueError: If a string/bytes value does not represent an int.
"""
    if not isinstance(control_parameter, int):
        # str and bytes cannot be hex formatted directly; convert them first.
        # (Formatting bytes raises TypeError rather than ValueError, so an
        # `except ValueError` fallback would never convert them.)
        control_parameter = int(control_parameter)
    return f"{control_parameter:#06x}"
def get_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)

Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

Args

original
The original string
revised
The changed version of the string
sep : string
A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.

Returns

A string object representing the diff of the two strings provided.

Expand source code
def get_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Get the diff of two strings. Defaults to splitting by newlines and keeping the ends.

Args:
    original: The original string (bytes)
    revised: The changed version of the string (bytes)
    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Newlines are removed before splitting, and all empty strings are deleted before diffing to make the diff more concise.

Returns:
    A string object representing the diff (difflib context diff) of the two strings provided. Empty when there are no differences.
"""
    def _as_text(chunk: bytes) -> str:
        # Bytes that are not valid UTF-8 are shown as \xNN escapes instead of
        # aborting the diff with a UnicodeDecodeError.
        return chunk.decode(errors="backslashreplace")

    if sep is None:
        orig_split = _as_text(original).splitlines(keepends=True)
        revised_split = _as_text(revised).splitlines(keepends=True)
    else:
        original = original.replace(b'\n', b'')
        revised = revised.replace(b'\n', b'')
        # `if piece` drops empty strings and also the None entries re.split
        # produces for capturing groups that did not take part in a match.
        orig_split = [_as_text(piece) for piece in re.split(sep, original) if piece]
        revised_split = [_as_text(piece) for piece in re.split(sep, revised) if piece]
    return "\n".join(difflib.context_diff(orig_split, revised_split))
def get_tree_diff(original: lark.tree.Tree, revised: lark.tree.Tree)

Get the diff of two trees.

Args

original : lark Tree
A lark tree before transformation
revised : lark Tree
A lark tree after transformation

Returns

A string object representing the diff of the two Trees provided.

Example

rtf_obj = DeEncapsulator(raw_rtf)
rtf_obj.deencapsulate()
transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
get_tree_diff(rtf_obj.full_tree, transformed_tree)

Expand source code
def get_tree_diff(original: Tree, revised: Tree):
    """Get the diff of two trees.

Both trees are flattened with flatten_tree and the resulting lists are compared with a difflib context diff.

Args:
    original (lark Tree): A lark tree before transformation
    revised (lark Tree): A lark tree after transformation

Returns:
    A string object representing the diff of the two Trees provided.

Example:
    rtf_obj = DeEncapsulator(raw_rtf)
    rtf_obj.deencapsulate()
    transformed_tree = SomeTransformer.transform(rtf_obj.full_tree)
    get_tree_diff(rtf_obj.full_tree, transformed_tree)

    """
    flat_original = list(flatten_tree(original))
    flat_revised = list(flatten_tree(revised))
    return "\n".join(difflib.context_diff(flat_original, flat_revised))
def is_codeword_with_numeric_arg(token: Union[lark.lexer.Token, Any], codeword: bytes) ‑> bool

Checks if a Token is a codeword with a numeric argument.

Returns

True if a Token is a codeword with a numeric argument. False if not.

Expand source code
def is_codeword_with_numeric_arg(token: Union[Token,Any], codeword: bytes) -> bool:
    """Tell whether a Token holds `codeword` followed only by digits.

Args:
    token: A Token, or any other object. Objects lacking the attributes used here (e.g. no `.value`) are never a match.
    codeword (bytes): The control word to look for at the start of the token value.

Returns:
    True if the stripped token value starts with the codeword and everything after it is digits. False otherwise.
"""
    try:
        stripped = token.value.strip()
        if not stripped.startswith(codeword):
            return False
        argument = stripped[len(codeword):]
        return argument.isdigit()
    except AttributeError:
        # Not token-like: treat as "no match" instead of failing.
        return False
def is_logger_on(logger_name, level=10)

Check if a logger is enabled and on debug.

Expand source code
def is_logger_on(logger_name, level=logging.DEBUG):
    """Report whether the named logger is set to exactly `level`.

Args:
    logger_name: Name of the logger to look up.
    level: Level to compare against. Defaults to logging.DEBUG.

Returns:
    True if the logger's own level equals `level` (a level inherited from a
    parent logger does not count), otherwise False.
    """
    own_level = logging.getLogger(logger_name).level
    return own_level == level
def log_htmlrtf_stripping(data: lark.lexer.Token)

Log HTMLRTF Stripping logging only if RTFDE.HTMLRTF_Stripping_logger set to debug.

Raises

AttributeError
Will occur if you pass this something that is not a token.
Expand source code
def log_htmlrtf_stripping(data: Token):
    """Log a removed HTMLRTF Token on the RTFDE.HTMLRTF_Stripping_logger logger.

The message carries the token's value, line, end_line, start_pos and end_pos.
Nothing happens (and `data` is not checked) unless that logger's own level is exactly DEBUG.

Args:
    data (lark Token): The token that was stripped.

Raises:
    AttributeError: Will occur if you pass this something that is not a token.
"""
    stripping_log = logging.getLogger("RTFDE.HTMLRTF_Stripping_logger")
    if stripping_log.level != logging.DEBUG:
        return
    if not isinstance(data, Token):
        raise AttributeError("HTMLRTF Stripping logger only logs Tokens")
    details = {"value": data.value,
               "line": data.line,
               "end_line": data.end_line,
               "start_pos": data.start_pos,
               "end_pos": data.end_pos}
    template = "HTMLRTF Removed: {value}, {line}, {end_line}, {start_pos}, {end_pos}"
    stripping_log.debug(template.format(**details))
def log_string_diff(original: bytes, revised: bytes, sep: Optional[bytes] = None)

Log diff of two strings. Defaults to splitting by newlines and keeping the ends.

Logs the result in the main RTFDE logger as a debug log. Warning: Only use when debugging as this is too verbose to be used in regular logging.

Args

original
The original string
revised
The changed version of the string
sep : string
A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
Expand source code
def log_string_diff(original: bytes, revised: bytes, sep: Union[bytes,None] = None):
    """Write the diff of two byte strings to the main RTFDE logger as a debug log.

The diff is produced by get_string_diff; by default the strings are split by newlines, keeping the line ends.

Warning: Only use when debugging as this is too verbose to be used in regular logging.

Args:
    original: The original string
    revised: The changed version of the string
    sep (bytes): A pattern to split the string by. Uses re.split under the hood. NOTE: Deletes all empty strings before diffing to make the diff more concise.
"""
    diff_text = get_string_diff(original, revised, sep)
    log.debug(diff_text)
def log_text_extraction(data)

Log additional text decoding/encoding logging only if RTFDE.text_extraction set to debug.

Expand source code
def log_text_extraction(data):
    """Emit text decoding/encoding details on the RTFDE.text_extraction logger.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    extraction_log = logging.getLogger("RTFDE.text_extraction")
    if extraction_log.level != logging.DEBUG:
        return
    extraction_log.debug(data)
def log_transformations(data)

Log transform logging only if RTFDE.transform_logger set to debug.

Expand source code
def log_transformations(data):
    """Emit `data` on the RTFDE.transform_logger logger at debug level.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    transform_log = logging.getLogger("RTFDE.transform_logger")
    if transform_log.level != logging.DEBUG:
        return
    transform_log.debug(data)
def log_validators(data)

Log validator logging only if RTFDE.validation_logger set to debug.

Expand source code
def log_validators(data):
    """Emit `data` on the RTFDE.validation_logger logger at debug level.

Nothing is logged unless that logger's own level is exactly DEBUG; a level
inherited from a parent logger is not taken into account.
    """
    validation_log = logging.getLogger("RTFDE.validation_logger")
    if validation_log.level != logging.DEBUG:
        return
    validation_log.debug(data)
def make_token_replacement(ttype, value, example)
Expand source code
def make_token_replacement(ttype, value, example):
    """Build a new Token that takes over the position of an existing node.

Args:
    ttype: The type of the new Token.
    value: The value of the new Token.
    example (lark Token|Tree): Node whose position information (start_pos, end_pos, line, end_line, column, end_column) is copied. A Token supplies its own attributes; a Tree supplies those of its `meta`.

Returns:
    A Token of type `ttype` holding `value`, carrying the position of `example`.

Raises:
    TypeError: If example is neither a Token nor a Tree.
"""
    if isinstance(example, Token):
        position = example
    elif isinstance(example, Tree):
        position = example.meta
    else:
        # Fail with a clear message; without this branch the function ended
        # in an UnboundLocalError at the return statement.
        raise TypeError("example must be a lark Token or Tree, "
                        f"not {type(example).__name__}")
    return Token(ttype,
                 value,
                 start_pos=position.start_pos,
                 end_pos=position.end_pos,
                 line=position.line,
                 end_line=position.end_line,
                 column=position.column,
                 end_column=position.end_column)
def print_lark_parser_evaluated_grammar(parser)

Prints the final evaluated grammar.

Can be useful for debugging possible errors in grammar evaluation.

Args

parser : Lark obj
Lark object to extract grammar from.
Expand source code
def print_lark_parser_evaluated_grammar(parser):
    """Prints the final evaluated grammar.

Can be useful for debugging possible errors in grammar evaluation.

Args:
    parser (Lark obj): Lark object to extract grammar from.

Raises:
    ValueError: If parser is not a Lark object.
    """
    if not isinstance(parser, Lark):
        raise ValueError("Requires a Lark object.")
    banner_edge = " " + "=" * 15 + " "
    sections = (("RULES", "rules"),
                ("TERMINALS", "terminals"),
                ("IGNORED TOKENS", "ignore_tokens"))
    for title, attribute in sections:
        print(banner_edge + title + banner_edge + "\n")
        for entry in getattr(parser, attribute):
            # Format instead of concatenating: entries need not be plain
            # strings (presumably lark rule/terminal definition objects --
            # confirm against the lark version in use), and `str + object`
            # raises TypeError.
            print(f"    {entry}")
def print_to_tmp_file(data: Union[~AnyStr, bytes, bytearray], path: str)

Prints binary object to a dump file for quick debugging.

Warning: Not for normal use. Only use when debugging.

Args

data : bytes|str
Data to write to path
path : str
The file path to write data to
Expand source code
def print_to_tmp_file(data: Union[AnyStr,bytes,bytearray], path: str):
    """Writes an object to a dump file for quick debugging.

Warning: Not for normal use. Only use when debugging.

Args:
    data (bytes|str): Data to write to path. bytes/bytearray are written unchanged in binary mode; anything else is printed (its str() form plus a trailing newline) in text mode.
    path (str): The file path to write data to
    """
    if isinstance(data, (bytes, bytearray)):
        # print() only emits str, which a binary file handle rejects, so
        # binary data is written to the file directly.
        with open(path, 'wb+') as fp:
            fp.write(data)
    else:
        # Hand the file to print() instead of swapping sys.stdout, so stdout
        # can never be left pointing at a closed file if writing fails.
        with open(path, 'w+') as fp:
            print(data, file=fp)