Module RTFDE.deencapsulate

Source code
# -*- coding: utf-8 -*-
#
# This file is part of RTFDE, a RTF De-Encapsulator.
# Copyright © 2020 seamus tuohy, <code@seamustuohy.com>
#
# This program is free software: you can redistribute it and/or modify it
# under the terms of the GNU Lesser General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the included LICENSE file for details.

from typing import Union, AnyStr, Tuple, Dict, Any
from io import BufferedReader

from lark import Lark
from lark.tree import Tree
from lark.lexer import Token
from lark.exceptions import UnexpectedInput

from RTFDE.transformers import RTFCleaner, StripControlWords
from RTFDE.transformers import StripNonVisibleRTFGroups
from RTFDE.transformers import StripUnusedSpecialCharacters
from RTFDE.utils import encode_escaped_control_chars
from RTFDE.utils import log_validators, log_transformations, is_logger_on
from RTFDE.transformers import get_stripped_HTMLRTF_values, DeleteTokensFromTree, strip_binary_objects
from RTFDE.grammar import make_concise_grammar
from RTFDE.text_extraction import TextDecoder
from RTFDE.text_extraction import validate_ansi_cpg

# For catching exceptions
from RTFDE.exceptions import NotEncapsulatedRtf, MalformedEncapsulatedRtf, MalformedRtf

import logging
log = logging.getLogger("RTFDE")

class DeEncapsulator():
    """De-Encapsulating RTF converter of HTML/TEXT found in .msg files.

De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.


Parameters:
    raw_rtf: (bytes): The raw RTF file as bytes.
    grammar: (str): OPTIONAL - A Lark parsing grammar which defines the RTF language (https://github.com/lark-parser/lark). If you think my grammar is shoddy, this is your chance to test out a better one and make a pull request.

Attributes:
    content: (bytes) The deencapsulated content no matter what format it is in. Populated by the `deencapsulate` function.
    html: (bytes) The deencapsulated content IF it is HTML content. Populated by the `set_content` function.
    text: (bytes) The deencapsulated content IF it is plain text content. Populated by the `set_content` function.
    found_binary: List of dictionaries containing binary data extracted from the rtf file.
    content_type: The type of content encapsulated in .rtf data (html or text). Populated by the `get_content_type` function.
    full_tree: The full .rtf object parsed into an object Tree using the grammar. Populated by the `parse_rtf` function.
    doc_tree: The `document` portion of the .rtf full_tree object.
    raw_rtf: The raw encapsulated .rtf data in byte format.
    grammar: The Lark parsing grammar used to parse the .rtf data.
    content_type_token: The .rtf header token identifying the content type. (\\fromhtml1 OR \\fromtext)
    parser: The lark parser. Should not need to be manipulated directly, but it is useful for debugging and saving the parsed object.
    """

    def __init__(self, raw_rtf:bytes, grammar: Union[str,None] = None):
        """Load in the Encapsulated test and setup the grammar used to parse the encapsulated RTF.

NOTE: Parsing is not done in the init so that you can instantiate the object and then do the parsing step by step.

Parameters:
        raw_rtf: (bytes): The raw RTF data as bytes.
        grammar: (str): OPTIONAL - A Lark parsing grammar which defines the RTF language (https://github.com/lark-parser/lark). If you think my grammar is shoddy, this is your chance to test out a better one and make a pull request.

Raises:
        TypeError: The raw_rtf data passed is not the correct type of data (it must be a byte string).
"""
        self.content: str
        self.content_type: str
        self.content_type_token: str
        self.parser: Any

        self.html: str
        self.text: str
        self.found_binary: list
        self.full_tree: Tree
        self.doc_tree: Tree
        self.catch_common_validation_issues(raw_rtf)
        if isinstance(raw_rtf, bytes):
            raw_rtf_bytes = raw_rtf
        else:
            raise TypeError("DeEncapssulator only accepts RTF files in string or byte-string formats")
        raw_rtf_bytes = raw_rtf_bytes.rstrip(b'\x00')
        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r\n',b'\n')
        raw_rtf_bytes = raw_rtf_bytes.replace(b'\r',b'\n')
        self.raw_rtf: bytes = raw_rtf_bytes
        if grammar is not None:
            self.grammar: str = grammar
        else:
            self.grammar = make_concise_grammar()

    def deencapsulate(self):
        """De-encapsulate the RTF content loaded into the De-Encapsulator.

Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The `content` property will store the content no matter what format it is in. The `html` and `text` properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.)
        """
        stripped_data = strip_binary_objects(self.raw_rtf)
        non_binary_rtf = stripped_data[0]
        found_binary = stripped_data[1]
        if len(found_binary) > 0:
            self.found_binary = found_binary
            log.info("Binary data found and extracted from rtf file.")
        escaped_rtf = encode_escaped_control_chars(non_binary_rtf)
        if is_logger_on("RTFDE.transform_logger") is True:
            log_transformations(escaped_rtf)
        try:
            self.parse_rtf(escaped_rtf)
        except UnexpectedInput as _e:
            raise MalformedEncapsulatedRtf(f"Malformed encapsulated RTF discovered:") from _e
        Decoder = TextDecoder()
        Decoder.update_children(self.full_tree)
        self.get_doc_tree()
        self.validate_encapsulation()

        # remove htmlrtf escaped values
        htmlrtf_stripped = self.strip_htmlrtf_tokens()
        # Strips whitespace from control words
        control_stripped = StripControlWords().transform(htmlrtf_stripped)
        # Strip unused control chars
        special_stripper = StripUnusedSpecialCharacters()
        non_special_tree = special_stripper.transform(control_stripped)
        # Strip out non-visible RTF groups
        stripper = StripNonVisibleRTFGroups()
        stripped_tree = stripper.transform(non_special_tree)
        # Converts any remaining tokens
        cleaner = RTFCleaner(visit_tokens=True)
        cleaned_text = cleaner.transform(stripped_tree)

        self.content = cleaned_text
        self.set_content() # Populates self.html || self.text

    def validate_charset(self, fallback_to_default:bool =False) -> bytes:
        """Validate and return the RTF charset keyword from the RTF streams header.

Args:
        fallback_to_default (bool): Allows you to force the use of the default charset "\\ansi" if one is not found.

Raises:
        MalformedRtf: RTF stream does not include charset control word.

Returns:
        The RTF charset keyword from the RTF stream's header.
"""
        main_headers = self.get_header_control_words_before_first_group()

        for token in main_headers:
            if token.value in [b'\\ansi', b'\\mac', b'\\pc', b'\\pca']:
                return token

        log.debug("Acceptable charset not found as the second token in the RTF stream. The control word for the character set must precede any plain text or any table control words. So, if this stream doesn't have one it is malformed or corrupted.")
        if fallback_to_default is False:
            raise MalformedRtf("RTF stream does not include charset control word.")

        log.warning("The fallback_to_default option on _get_charset is considered DANGEROUS if used on possibly malicious samples. Make sure you know what you are doing before using it.")
        log.info("Attempting to decode RTF using the default charset ansi. This is not recommended and could have unforeseen consequences for the resulting file and your systems security.")
        log.debug("You have a malformed RTF stream. Are you sure you really want to be parsing it? It might not just be corrupted. It could be maliciously constructed.")
        return b"\\ansi"

    def set_content(self):
        """Populate the html or text content based on the content type. Populates self.html and/or self.text variables."""
        self.content_type = self.get_content_type()
        if self.content_type == 'html':
            self.html = self.content
        else:
            self.text = self.content

    def get_doc_tree(self):
        """Extract the document portion of the .rtf full_tree object. Populates the classes doc_tree attribute.

Raises:
        ValueError: The .rtf document object is missing or mis-located in the .rtf's full_tree object.
"""
        if self.full_tree.children[1].data == "document":
            self.doc_tree = self.full_tree.children[1]
        else:
            raise ValueError("Document object in the wrong place after parsing.")

    def get_content_type(self):
        """Provide the type of content encapsulated in RTF.

NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.

Raises:
        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
"""
        if self.content_type_token is None:
            self.validate_FROM_in_doc_header()
        elif self.content_type_token == b'\\fromhtml1':
            return 'html'
        elif self.content_type_token == b'\\fromtext':
            return "text"

        raise NotEncapsulatedRtf("Data is missing encapsulated content type header (the FROM header).")

    def validate_encapsulation(self):
        """Runs simple tests to validate that the file in question is an rtf document which contains encapsulation."""
        self.validate_rtf_doc_header(self.doc_tree)
        self.validate_charset()
        self.validate_FROM_in_doc_header()
        ansicpg = self.get_ansicpg_header()
        if ansicpg is not None: # ansicpg is not mandatory
            validate_ansi_cpg(ansicpg.value)

    def get_ansicpg_header(self) -> Union[Token,None]:
        """Extract the ansicpg control word from the .rtf header.

Returns:
        A lark CONTROLWORD Token with the `\\ansicpg` value. Returns None if the `\\ansicpg` control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.
"""
        headers = self.get_header_control_words_before_first_group()
        for item in headers:
            if item.value.startswith(b'\\ansicpg'):
                return item
        return None

    def parse_rtf(self, rtf: str):
        """Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.

Args:
        rtf: The .rtf string to parse with the project's lark grammar.
"""
        # Uncomment the Lark debug argument if you want to enable logging.
        # Note, this does not enable ALL lark debug logging.
        # To do that we would have to stop using the Lark convenience class which we are using here.
        self.parser = Lark(self.grammar,
                           parser='lalr',
                           keep_all_tokens=True,
                           use_bytes=True,
                           # debug=True,
                           propagate_positions=True)
        self.full_tree = self.parser.parse(rtf)
        if is_logger_on("RTFDE.transform_logger") is True:
            log_transformations(self.full_tree)


    def strip_htmlrtf_tokens(self) -> Tree:
        """Strip tokens from with htmlrtf regions of the doc_tree as they were not part of the original HTML content.

Returns:
        .rtf doc_tree stripped of all non-original tokens.
"""
        # remove htmlrtf escaped values
        delete_generator = get_stripped_HTMLRTF_values(self.doc_tree)
        tokens_to_delete = list(delete_generator)
        deleter = DeleteTokensFromTree(tokens_to_delete)
        htmlrtf_cleaned_tree = deleter.transform(self.doc_tree)
        return htmlrtf_cleaned_tree


    def get_header_control_words_before_first_group(self) -> list:
        """Extracts all the control words in the first 20 tokens of the document or all the tokens which occur before the first group (whichever comes first.)

This is used to extract initial header values for validation functions.

Returns:
        A list containing the header tokens in the .rtf data.
        """
        initial_control_words = []
        for token in self.doc_tree.children[:20]:
            if isinstance(token, Token):
                initial_control_words.append(token)
            else:
                return initial_control_words
        return initial_control_words


    def validate_FROM_in_doc_header(self):
        """Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.

NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX

Raises:
        MalformedEncapsulatedRtf: The .rtf headers are malformed.
        NotEncapsulatedRtf: The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
        """
        cw_found = {"rtf1":False,
                    "from":False,
                    "fonttbl":False,
                    "malformed":False}
        # The de-encapsulating RTF reader SHOULD inspect no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. This means more than just control words.
        decoded_tree = StripControlWords().transform(self.doc_tree)
        first_ten_tokens = decoded_tree.children[:10]
        operating_tokens = []
        found_token = None
        for token in first_ten_tokens:
            if isinstance(token, Token):
                operating_tokens.append(token)
            else:
                operating_tokens += list(token.scan_values(lambda t: t.type == 'CONTROLWORD'))
        if is_logger_on("RTFDE.validation_logger") is True:
            log_validators(f"Header tokens being evaluated: {operating_tokens}")

        for token in operating_tokens:
            cw_found,found_token = self.check_from_token(token=token, cw_found=cw_found)
            if cw_found['from'] is True and cw_found["malformed"] is True:
                raise MalformedEncapsulatedRtf("RTF file looks like is was supposed to be encapsulated HTML/TEXT but the headers are malformed. Turn on debugging to see specific information")
            # Save content type token available for id-ing type of content later
            if found_token is not None:
                self.content_type_token = found_token

        if cw_found['from'] is False:
            log.debug("FROMHTML/TEXT control word not found in first 10 RTF tokens. This is not an HTML/TEXT encapsulated RTF document.")
            raise NotEncapsulatedRtf("FROMHTML/TEXT control word not found.")

    @staticmethod
    def check_from_token(token:Token, cw_found:dict) -> Tuple[Dict,Union[None,str]] :
        """Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.

Args:
        token: The token to check for in the cw_found state dictionary.
        cw_found: The state dictionary which is used to track the position of the from token within the header.

        `cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}`


Returns:
        cw_found: Updated state dictionary
        found_token: The content_type_token found in the header.

        """
        from_cws = [b'\\fromhtml1', b'\\fromtext']
        # This control word MUST appear before the \fonttbl control word and after the \rtf1 control word, as specified in [MSFT-RTF].
        rtf1_cw = b"\\rtf1"
        found_token = None
        fonttbl_cw = b"\\fonttbl"
        if token.type == "CONTROLWORD":
            if token.value.strip() in from_cws:
                if cw_found['from'] is True:
                    cw_found["malformed"] = True
                    log.debug("Multiple FROM HTML/TXT tokens found in the header. This encapsulated RTF is malformed.")
                if cw_found['rtf1'] is True:
                    cw_found['from'] = True
                    found_token = token.value
                else:
                    log.debug("FROMHTML/TEXT control word found before rtf1 control word. That's not allowed in the RTF spec.")
                    cw_found['from'] = True
                    cw_found["malformed"] = True
            elif token.value.strip() == rtf1_cw:
                cw_found['rtf1'] = True
            elif token.value.strip() == fonttbl_cw:
                cw_found['fonttbl'] = True
                if cw_found['from'] is not True:
                    log.debug("\\fonttbl code word found before FROMTML/TEXT was defined. This is not allowed for encapsulated HTML/TEXT. So... this is not encapsulated HTML/TEXT or it was badly encapsulated.")
                    cw_found["malformed"] = True
        return cw_found, found_token


    @staticmethod
    def validate_rtf_doc_header(doc_tree: Tree):
        """Check if doc starts with a valid RTF header `\\rtf1`.

        "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\\rtf1")." - MS-OXRTFEX

Raises:
        MalformedRtf: The .rtf headers do not include \\rtf1.
"""
        first_token = doc_tree.children[0].value
        if first_token != b"\\rtf1":
            log.debug("RTF stream does not contain valid valid RTF document heading. The file must start with \"{\\rtf1\"")
            if is_logger_on("RTFDE.validation_logger") is True:
                log_validators(f"First child object in document tree is: {first_token!r}")
            raise MalformedRtf("RTF stream does not start with {\\rtf1")

    @staticmethod
    def catch_common_validation_issues(raw_rtf: AnyStr):
        """Checks for likely common valid input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.

Args:
        raw_rtf: A raw .rtf string or byte-string.

Raises:
        TypeError: The data passed is the wrong type of data.
        MalformedRtf: The data passed is not a correctly formatted .rtf string.
"""
        if isinstance(raw_rtf, BufferedReader):
            raise TypeError("Data passed as file pointer. DeEncapsulator only accepts byte objects.")
        if raw_rtf is None:
            raise TypeError("Data passed as raw RTF file is a null object `None` keyword.")
        if raw_rtf[:8] == b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1":
            raise TypeError("Data passed is a full MSG object. You must extract the encapsulated RTF body first.")
        if raw_rtf in (b'', ''):
            raise MalformedRtf("Data passed as raw RTF file is an empty string.")

Classes

class DeEncapsulator (raw_rtf: bytes, grammar: Optional[str] = None)

De-Encapsulating RTF converter of HTML/TEXT found in .msg files.

De-encapsulation enables previously encapsulated HTML and plain text content to be extracted and rendered as HTML and plain text instead of the encapsulating RTF content. After de-encapsulation, the HTML and plain text should differ only minimally from the original HTML or plain text content.

Parameters

raw_rtf
(bytes) The raw RTF file as bytes.
grammar
(str) OPTIONAL - A Lark parsing grammar which defines the RTF language (https://github.com/lark-parser/lark). If you think my grammar is shoddy, this is your chance to test out a better one and make a pull request.

Attributes

content
(bytes) The deencapsulated content no matter what format it is in. Populated by the deencapsulate function.
html
(bytes) The deencapsulated content IF it is HTML content. Populated by the set_content function.
text
(bytes) The deencapsulated content IF it is plain text content. Populated by the set_content function.
found_binary
List of dictionaries containing binary data extracted from the rtf file.
content_type
The type of content encapsulated in .rtf data (html or text). Populated by the get_content_type function.
full_tree
The full .rtf object parsed into an object Tree using the grammar. Populated by the parse_rtf function.
doc_tree
The document portion of the .rtf full_tree object.
raw_rtf
The raw encapsulated .rtf data in byte format.
grammar
The Lark parsing grammar used to parse the .rtf data.
content_type_token
The .rtf header token identifying the content type. (\fromhtml1 OR \fromtext)
parser
The lark parser. Should not need to be manipulated directly, but it is useful for debugging and saving the parsed object.

Load in the encapsulated text and set up the grammar used to parse the encapsulated RTF.

NOTE: Parsing is not done in the init so that you can instantiate the object and then do the parsing step by step.

Parameters

raw_rtf
(bytes) The raw RTF data as bytes.
grammar
(str) OPTIONAL - A Lark parsing grammar which defines the RTF language (https://github.com/lark-parser/lark). If you think my grammar is shoddy, this is your chance to test out a better one and make a pull request.

Raises

TypeError
The raw_rtf data passed is not the correct type of data (it must be a byte string).
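
A minimal usage sketch based on the constructor and the deencapsulate method documented on this page. The file path is hypothetical; any encapsulated RTF body extracted from a .msg file will do.

from RTFDE.deencapsulate import DeEncapsulator

# Hypothetical path: any RTF body extracted from a .msg file will do.
with open("encapsulated_body.rtf", "rb") as fp:
    raw_rtf = fp.read()  # the constructor requires bytes, not a str or a file object

rtf_obj = DeEncapsulator(raw_rtf)
rtf_obj.deencapsulate()  # parses, validates, and populates .content plus .html or .text

if rtf_obj.content_type == 'html':
    print(rtf_obj.html)
else:
    print(rtf_obj.text)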

Static methods

def catch_common_validation_issues(raw_rtf: ~AnyStr)

Checks for common input mistakes that may occur when folks try to use this library and raises exceptions to try and help identify them.

Args

raw_rtf
A raw .rtf string or byte-string.

Raises

TypeError
The data passed is the wrong type of data.
MalformedRtf
The data passed is not a correctly formatted .rtf string.
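
A short sketch exercising two of the checks described above; the file path is hypothetical.

from RTFDE.deencapsulate import DeEncapsulator
from RTFDE.exceptions import MalformedRtf

# Passing an open file object instead of its bytes is rejected.
try:
    with open("encapsulated_body.rtf", "rb") as fp:
        DeEncapsulator.catch_common_validation_issues(fp)
except TypeError as err:
    print(err)

# An empty byte string is flagged as malformed rather than silently accepted.
try:
    DeEncapsulator.catch_common_validation_issues(b'')
except MalformedRtf as err:
    print(err)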
def check_from_token(token: lark.lexer.Token, cw_found: dict) -> Tuple[Dict, Optional[str]]

Checks if fromhtml1 or fromtext tokens are in the proper place in the header based on the state passed to it by the validate_FROM_in_doc_header function.

Args

token
The token to check for in the cw_found state dictionary.
cw_found
The state dictionary which is used to track the position of the from token within the header.

cw_found = {"rtf1":<BOOL>, "from":<BOOL>, "fonttbl":<BOOL>, "malformed":<BOOL>}

Returns

cw_found
Updated state dictionary
found_token
The content_type_token found in the header.
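
A small sketch showing how the state dictionary evolves as header tokens are fed in. The hand-built Token values mirror the control words this method checks for; real callers get these tokens from the parsed doc_tree.

from lark.lexer import Token
from RTFDE.deencapsulate import DeEncapsulator

state = {"rtf1": False, "from": False, "fonttbl": False, "malformed": False}

# Feed header tokens in the order a well-formed encapsulated stream presents them.
state, found = DeEncapsulator.check_from_token(Token("CONTROLWORD", b"\\rtf1"), state)
state, found = DeEncapsulator.check_from_token(Token("CONTROLWORD", b"\\fromhtml1"), state)

print(state)  # {'rtf1': True, 'from': True, 'fonttbl': False, 'malformed': False}
print(found)  # b'\fromhtml1'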
def validate_rtf_doc_header(doc_tree: lark.tree.Tree)

Check if doc starts with a valid RTF header \rtf1.

    "Before the de-encapsulating RTF reader tries to recognize the encapsulation, the reader SHOULD ensure that the document has a valid RTF document heading according to [MSFT-RTF] (that is, it starts with the character sequence "{\rtf1")." - MS-OXRTFEX

Raises

MalformedRtf
The .rtf headers do not include \rtf1.
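
A toy sketch of the check, using a hand-built tree rather than output of the real parser.

from lark.tree import Tree
from lark.lexer import Token
from RTFDE.deencapsulate import DeEncapsulator
from RTFDE.exceptions import MalformedRtf

# A stand-in document tree whose first token is not \rtf1.
bad_doc = Tree("document", [Token("CONTROLWORD", b"\\ansi")])
try:
    DeEncapsulator.validate_rtf_doc_header(bad_doc)
except MalformedRtf as err:
    print(err)  # RTF stream does not start with {\rtf1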

Methods

def deencapsulate(self)

De-encapsulate the RTF content loaded into the De-Encapsulator.

Once you have loaded in the raw rtf this function will set the properties containing the encapsulated content. The content property will store the content no matter what format it is in. The html and text properties will be populated based on the type of content that is extracted. (self.html will be populated if it is html and self.text if it is plain text.)
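
A sketch of what is available after the call, assuming raw_rtf already holds the encapsulated bytes as in the constructor example above.

rtf_obj = DeEncapsulator(raw_rtf)
rtf_obj.deencapsulate()

print(rtf_obj.content)  # populated regardless of whether the payload was HTML or plain text

# found_binary is only set when binary objects were stripped from the stream,
# so guard the attribute access.
binary_objects = getattr(rtf_obj, "found_binary", [])
print(f"{len(binary_objects)} binary object(s) extracted")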

def get_ansicpg_header(self) -> Optional[lark.lexer.Token]

Extract the ansicpg control word from the .rtf header.

Returns

A lark CONTROLWORD Token with the \ansicpg value. Returns None if the \ansicpg control word is not included as this is only required if there is Unicode which needs to be converted to ANSI within a .rtf file.

def get_content_type(self)

Provide the type of content encapsulated in RTF.

NOTE: This function will only work after the header validation has completed. Header validation also extracts the content type of the encapsulated data.

Raises

NotEncapsulatedRtf
The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
def get_doc_tree(self)

Extract the document portion of the .rtf full_tree object. Populates the class's doc_tree attribute.

Raises

ValueError
The .rtf document object is missing or mis-located in the .rtf's full_tree object.
def get_header_control_words_before_first_group(self) -> list

Extracts all the control words in the first 20 tokens of the document, or all the tokens which occur before the first group (whichever comes first).

This is used to extract initial header values for validation functions.

Returns

A list containing the header tokens in the .rtf data.

def parse_rtf(self, rtf: str)

Parse RTF file's header and document and extract the objects within the RTF into a Tree. Populates the self.full_tree attribute.

Args

rtf
The .rtf string to parse with the project's lark grammar.
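
A step-by-step sketch that mirrors the preprocessing deencapsulate() performs before calling this method. It assumes raw_rtf holds the encapsulated bytes, and is useful if you only want to parse and validate without extracting content.

from RTFDE.deencapsulate import DeEncapsulator
from RTFDE.transformers import strip_binary_objects
from RTFDE.utils import encode_escaped_control_chars
from RTFDE.text_extraction import TextDecoder

rtf_obj = DeEncapsulator(raw_rtf)

# Mirror the preprocessing deencapsulate() performs before parsing.
stripped_data = strip_binary_objects(rtf_obj.raw_rtf)
escaped_rtf = encode_escaped_control_chars(stripped_data[0])

rtf_obj.parse_rtf(escaped_rtf)                    # populates rtf_obj.full_tree
TextDecoder().update_children(rtf_obj.full_tree)  # decode text runs, as deencapsulate() does
rtf_obj.get_doc_tree()                            # populates rtf_obj.doc_tree
rtf_obj.validate_encapsulation()                  # raises if the headers are not encapsulated HTML/TEXT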
def set_content(self)

Populate the html or text content based on the content type. Populates self.html and/or self.text variables.

def strip_htmlrtf_tokens(self) -> lark.tree.Tree

Strip tokens from within htmlrtf regions of the doc_tree as they were not part of the original HTML content.

Returns

.rtf doc_tree stripped of all non-original tokens.

def validate_FROM_in_doc_header(self)

Inspect the header to identify what type of content (html/plain text) is encapsulated within the document.

NOTE: The de-encapsulating RTF reader inspects no more than the first 10 RTF tokens (that is, begin group marks and control words) in the input RTF document, in sequence, starting from the beginning of the RTF document. If one of the control words is the FROMHTML control word, the de-encapsulating RTF reader will conclude that the RTF document contains an encapsulated HTML document and stop further inspection. If one of the control words is the FROMTEXT control word, the de-encapsulating RTF reader concludes that the RTF document was produced from a plain text document and stops further inspection. - MS-OXRTFEX

Raises

MalformedEncapsulatedRtf
The .rtf headers are malformed.
NotEncapsulatedRtf
The .rtf object is missing an encapsulated content type header, which means it is likely just a regular .rtf file.
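
For reference, the opening tokens of a well-formed encapsulated HTML header look roughly like the sketch below. This is an illustration of the ordering rule from MS-OXRTFEX, not a complete document.

# The FROMHTML/FROMTEXT control word must appear after \rtf1 and before \fonttbl,
# within the first 10 RTF tokens (begin group marks and control words).
encapsulated_header = (
    b"{"               # 1: begin group mark
    b"\\rtf1"          # 2: RTF version
    b"\\ansi"          # 3: charset
    b"\\ansicpg1252"   # 4: code page
    b"\\fromhtml1"     # 5: the content type header this method looks for
    b"\\deff0"         # 6: default font, followed by {\fonttbl ...} and the rest of the header
)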
def validate_charset(self, fallback_to_default: bool = False) -> bytes

Validate and return the RTF charset keyword from the RTF stream's header.

Args

fallback_to_default : bool
Allows you to force the use of the default charset "\ansi" if one is not found.

Raises

MalformedRtf
RTF stream does not include charset control word.

Returns

The RTF charset keyword from the RTF stream's header.
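
A sketch of the strict and fallback behaviour; it assumes rtf_obj.doc_tree has already been populated as in the parse_rtf sketch above.

from RTFDE.exceptions import MalformedRtf

try:
    charset = rtf_obj.validate_charset()  # strict: raises if no charset control word is found
except MalformedRtf:
    charset = rtf_obj.validate_charset(fallback_to_default=True)  # returns b"\\ansi" instead
print(charset)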

def validate_encapsulation(self)

Runs simple tests to validate that the file in question is an rtf document which contains encapsulation.
