Source code for eezz.document


"""
This module implements the following classes

    * :py:class:`eezz.document.TManifest`: The Manifest contains the attributes and the content of a document.\
    This includes for example author, creation date and embedded external files.
    * :py:class:`eezz.document.TDocument`:  A document consists of one or more embedded files and the Manifest.\
    The class implements methods for file download and creating a TAR archive.

    A document has always a reference to a shelf, which contains documents with the same Manifest layout

"""
import  re
import  time
import  tarfile
import  json
from    loguru          import logger

from    abc             import abstractmethod
from    io              import BytesIO
from    eezz.filesrv     import TFile, TFileMode
from    service         import TService
from    pathlib         import Path
from    dataclasses     import dataclass
from    math            import floor
from    typing          import List, Dict, override


@dataclass(kw_only=True)
class TManifest:
    """ The manifest represents the information for the document. It defines a solid way to ensures a consistent
    structure for parsing the internal attributes.
    """
    keys_section_header: list
    keys_section_doc:    list = None
    keys_section_files:  list = None
    structure_document:  dict = None
    map_files:           dict = None

    def __post_init__(self):
        # Prepare consistent access of manifest to database
        self.keys_section_doc    = ['document', 'files', 'signature']
        self.keys_section_files  = ['source', 'name', 'size', 'type']
        self.structure_document  = {'document': {}, 'files': []}

    @property
    def document(self) -> dict:
        """:meta private:"""
        return self.structure_document['document']

    @document.setter
    def document(self, value: dict):
        """:meta private:"""
        self.map_files = dict()
        x_document_descr = {x: value.get(x, '') for x in self.keys_section_header}
        self.structure_document['document'] = x_document_descr

    @property
    def files(self) -> list:
        """:meta private:"""
        return self.structure_document['files']

    def append_file(self, file: dict):
        """:meta private:"""
        if not self.map_files.get(file['source']):
            self.map_files[file['source']] = list()

        x_file_descr = {x: file.get(x, '') for x in self.keys_section_files}
        self.structure_document['files'].append(x_file_descr)
        self.map_files[file['source']].append(file['name'])

    @property
    def column_names(self) -> list:
        """:meta private:"""
        return self.keys_section_header

    @override
    def __str__(self):
        for x, y in self.map_files.items():
            self.structure_document['document'][x] = y
        return json.dumps(self.structure_document, indent=4)

    def loads(self, manifest_str):
        """:meta private:"""
        self.structure_document = json.loads(manifest_str)



[docs]
@dataclass(kw_only=True)
class TDocument:
    """ Manages documents
    A document is a zipped TAR file, w  ith a Metafile and a collection of data files.

    :ivar Path          path:            Documents bookshelf path
    :ivar List[str]     attributes:      List of attributes like author and title
    :ivar str           shelf_name:      A document has always a reference to a bookshelf
    :ivar TManifest     manifest:        Document header definition
    :ivar List[TFile]   files_list:      List of embedded files
    """
    shelf_name:     str                             #: :meta private:
    attributes:     List[str]                       #: List of document attributes like author and title
    manifest:       TManifest           = None      #: :meta private:
    path:           Path                = None      #: :meta private:
    count:          int                 = 0         #: :meta private:
    finished:       bool                = False     #: :meta private:
    file_sources:   List[str]           = None      #: :meta private:
    files_transferred: int              = 0
    map_files:      Dict[str, TFile]    = None      #: :meta private:
    map_source:     Dict[str, List[TFile]] = None   #: :meta private:
    transferred:    int                 = 0         #: :meta private:
    title:          str                 = ''        #: :meta private:

    def __post_init__(self):
        """ combine attributes:
        The mandatory attribute "title" is inserted at the start,
        the file sources at the end

        The file sources might have one or more references, which are represented as list of files in the Manif<est
        """
        self.attributes = [x for x in self.attributes]

        if self.file_sources:
            self.attributes += self.file_sources

        if not self.path:
            self.path = TService().document_path
        self.manifest = TManifest(keys_section_header=self.attributes)


[docs]
    def initialize_document(self, values: list) -> None:
        """ Initializes the document, providing values for the Manifest header

        :param List[str] values:  List of values according to the definition of columns
        """
        self.finished           = False
        self.count              = 0
        self.manifest.document  = {x: y for x, y in zip(self.attributes, values)}
        self.map_source         = dict()
        self.map_files          = dict()
        self.transferred        = 0
        self.title              = values[0]



[docs]
    def download_file(self, file: dict, stream: bytes = b'') -> bytes:
        """ Download file callback.
        The method is called for each chunk and for a final acknowledge. If all files
        are transferred, the document is created.
        For each final acknowledge a 100% is returned.

        :param dict      file:   File descriptor with details on file and byte stream
        :param bytearray stream: File stream, usually a chunk of
        :return: The percentage of transferred document size as bytearray, terminated with the percent sign
        :rtype:  bytearray

        """
        if file['opcode'] == 'finished':
            # Check if we got all elements of a single source:
            if self.transferred == file['all_volume']:
                self.create_document()
                self.finished = True
                return '100%'.encode('utf8')
            x_fraction = 100 * len(self.map_source[file['source']]) / int(file['src_files'])
            return f'{x_fraction}%'.encode('utf8')

        # Manage the file sources: Each source may have many entries
        # Drag hte number of successful loaded elements of this specific source
        if not self.map_source.get(file['source']):
            self.map_source[file['source']] = list()

        if not self.map_files.get(file['name']):
            x_path  = TService().public_path / self.title
            x_path.mkdir(exist_ok=True)
            x_path /= file['name']

            xt_file = TFile(file_type=file['source'], size = file['size'], chunk_size = file['chunk_size'], destination = x_path)
            self.map_files[file['name']] = xt_file
            self.map_source[file['source']].append(xt_file)
            self.manifest.append_file(file)

        xt_file = self.map_files[file['name']]
        xt_file.write(stream, file['sequence'], mode = TFileMode.NORMAL)

        # return percentage for a specific source
        self.transferred += xt_file.transferred

        x_src_volume = 0
        for xt_f in self.map_source[file['source']]:
            x_src_volume += xt_f.transferred

        x_percent  = 100 * x_src_volume / int(file['src_volume'])
        return f'{x_percent}%'.encode('utf8')



[docs]
    @abstractmethod
    def create_document(self):
        """ Abstract method which is called after all files are in place """
        pass



[docs]
    def create_archive(self, document_title: str) -> None:
        """ ZIP the given files and the manifest to a document.
        The TFile class keeps track on the location of the file content and their properties.

        :param str document_title: The name of the archive
        """
        x_zip_stream   = BytesIO()
        x_zip_stream.write(str(self.manifest).encode('utf-8'))
        x_zip_root     = Path('.')
        # Path is: destination / book / document
        x_destination  = self.path / f'{self.shelf_name}/{document_title}.tar'
        x_destination.parent.mkdir(exist_ok=True)

        with tarfile.TarFile(x_destination, "w") as x_zip_file:
            # store the info at the start of the tar file
            x_entry_path       = Path(x_zip_root) / 'Manifest'
            x_tar_info         = tarfile.TarInfo(name=str(x_entry_path))
            x_tar_info.size    = x_zip_stream.tell()
            x_tar_info.mtime   = floor(time.time())

            x_zip_stream.seek(0)
            x_zip_file.addfile(tarinfo=x_tar_info, fileobj=x_zip_stream)

            # Sort the files by source for output zip:
            for x_name, x_file in self.map_files.items():
                x_entry_path     = Path(x_zip_root) / x_file.destination.name
                x_source_path    = x_file.destination
                x_stat_info      = x_source_path.stat()
                x_tar_info       = tarfile.TarInfo(name=str(x_entry_path))
                x_tar_info.size  = x_stat_info.st_size
                x_tar_info.mtime = floor(time.time())

                with x_source_path.open("rb") as x_input:
                    x_zip_file.addfile(tarinfo=x_tar_info, fileobj=x_input)



[docs]
    def read_file(self, document_title: str, file_name: str) -> bytes:
        """ Returns the bytestream of the specified file in the archive

        :param str document_title:  The title of the document is the name of the archive
        :param str file_name:       The file content to return
        """
        x_source = self.path / f'{self.shelf_name}/{document_title}.tar'
        with tarfile.TarFile(x_source, "r") as x_zip_file:
            for x_tar_info in x_zip_file.getmembers():
                x_dest = Path(x_tar_info.name)
                if x_dest.name == file_name:
                    if x_buffer := x_zip_file.extractfile(x_tar_info):
                        return x_buffer.read()



[docs]
    def extract_file(self, document_title: str, file_pattern: str = None, dest_root: Path = '.') -> None:
        """ Restores the specified files, given by the regular expression in file_pattern

        :param str  document_title: The document title is the name of the archive
        :param Path dest_root:      The path within the archive for all entries
        :param str  file_pattern:   The files to extract
        """
        if not file_pattern:
            file_pattern = r'\S*'

        x_source = self.path / f'{self.shelf_name}/{document_title}.tar'
        with tarfile.TarFile(x_source, "r") as x_zip_file:
            for x_tar_info in x_zip_file.getmembers():
                x_dest = Path(x_tar_info.name)
                if re.search(file_pattern, x_dest.name):
                    x_local_file = dest_root / x_dest
                    if x_buffer := x_zip_file.extractfile(x_tar_info):
                        with x_local_file.open('wb') as file:
                            file.write(x_buffer.read())




# -- section for module tests
def test_document():
    """:meta private:"""
    logger.debug('Test class Document')
    my_doc = TDocument(shelf_name='First', attributes=['title', 'desc', 'author', 'price', 'valid'], file_sources=['main'])
    logger.success('test document')


if __name__ == '__main__':
    """:meta private:"""
    TService.set_environment(root_path='/Users/alzer/Projects/github/eezz_full/webroot')
    test_document()