r"""
Module oodiff - Open Office Text diff

It contains class OODiff.
"""

import misc, zipfile, re, textdiff, sys

class OODiff():
    r"""Class which extracts text from odt files and compares it.

    Odt files are opened as zip archives and the member content.xml is read
    as text. Formatting marks are removed from the text and the predefined
    XML character entities are replaced. The resulting plain text is split
    into lines and handed to textdiff.TextDiff, which produces the diff in
    the format selected by the options.
    """

    # Patterns are compiled once for the class instead of on every call.
    _PAT_TABLE = re.compile(r'<table:table.*?</table:table>')
    _PAT_BLOCK_END = re.compile(r'</text:[hp]>')
    _PAT_EMPTY_BLOCK = re.compile(r'<text:[hp][^<]*/>')
    _PAT_TABLE_ROW_END = re.compile(r'</table:table-row>')
    _PAT_TABLE_CELL_END = re.compile(r'</table:table-cell>')
    _PAT_PAGE_BREAK = re.compile(r'<text:soft-page-break/>')
    _PAT_ANY_MARK = re.compile(r'<.+?>')
    _PAT_ENTITY = re.compile(r'&(lt|gt|amp|apos|quot);')

    # Replacement characters for the entity names matched by _PAT_ENTITY.
    _ENTITIES = {'lt': '<', 'gt': '>', 'amp': '&', 'apos': '\'', 'quot': '"'}

    def __init__(self, file1_name, file2_name, options):
        r"""Constructs OODiff instance.

        Opens files `file1_name` and `file2_name` and reads text from them.

        parameters:
        file1_name - name of the first file
        file2_name - name of the second file
        options - options from optparse; `options.line_width` is read here

        instance variables:
        self.file1_name - name of the first file
        self.file2_name - name of the second file
        self.text1 - text of the first file
        self.text2 - text of the second file
        self.line_length - length of line in characters
        self.options - options from optparse

        """
        self.line_length = options.line_width
        self.options = options
        self.file1_name = file1_name
        self.file2_name = file2_name

        self.text1 = self.get_text(self.read_odt(self.file1_name))
        self.text2 = self.get_text(self.read_odt(self.file2_name))

    def read_odt(self, file_name):
        r"""Unzips odt document and returns content.xml decoded as UTF-8.

        parameters:
        file_name - name of the odt file (anything zipfile.ZipFile accepts)

        If the file is not a valid zip archive, has no content.xml member or
        cannot be read, a message is written to stderr and the program exits
        with status 1.
        """
        try:
            # The with block closes the archive on success and on error.
            with zipfile.ZipFile(file_name) as document:
                raw = document.read('content.xml')
        except zipfile.BadZipfile as err:
            sys.stderr.write("Bad odt file: {0}\n".format(err))
            sys.exit(1)
        except KeyError as err:
            # ZipFile.read raises KeyError when the member does not exist.
            sys.stderr.write("Bad odt file: {0}\n".format(err))
            sys.exit(1)
        except IOError as err:
            sys.stderr.write("I/O error: {0}\n".format(err))
            sys.exit(1)

        return str(raw, "utf-8")

    @staticmethod
    def _flatten_table(match):
        r"""Returns the matched table with cells on one line per row.

        Inside a table, paragraph ends and cell ends become spaces and row
        ends become '\n'. Remaining marks are left for get_text to remove.
        """
        table = match.group(0)
        table = OODiff._PAT_BLOCK_END.sub(' ', table)
        table = OODiff._PAT_EMPTY_BLOCK.sub(' ', table)
        table = OODiff._PAT_TABLE_ROW_END.sub('\n', table)
        table = OODiff._PAT_TABLE_CELL_END.sub(' ', table)
        return table

    def get_text(self, content):
        r"""Returns the text without formatting marks.

        Using regular expressions, the text is parsed. Formatting marks are
        removed and special character entities are substituted. Pure text is
        returned.

        parameters:
        content - text of content.xml

        Paragraph and heading ends become '\n', soft page breaks become a
        line of `self.line_length` dashes, and tables are flattened to one
        line per row.
        """
        # Tables go first, while their row and cell marks are still present.
        content = self._PAT_TABLE.sub(self._flatten_table, content)
        content = self._PAT_BLOCK_END.sub('\n', content)
        content = self._PAT_EMPTY_BLOCK.sub('\n', content)
        content = self._PAT_PAGE_BREAK.sub(('-' * self.line_length) + '\n', content)
        content = self._PAT_ANY_MARK.sub('', content)

        # All entities are decoded in a single pass so that the '&' produced
        # from '&amp;' is never decoded again ('&amp;apos;' gives '&apos;').
        entities = self._ENTITIES
        return self._PAT_ENTITY.sub(lambda found: entities[found.group(1)], content)

    def make_list_of_lines(self, text):
        r"""From text makes list of lines, each terminated by '\n'.

        parameters:
        text - text to split

        Lines longer than `self.line_length` are cut into pieces of at most
        that many characters. A line length smaller than 1 disables cutting.
        """
        width = self.line_length
        result = []

        for line in text.split('\n'):
            if width < 1 or len(line) <= width:
                result.append(line + '\n')
            else:
                # Stepping by width yields no empty trailing piece when the
                # length of the line is an exact multiple of the width.
                result.extend(line[start:start + width] + '\n'
                              for start in range(0, len(line), width))

        return result

    def compare(self):
        r"""Compares the two texts according to the options.

        The texts read by the constructor are split into lines and passed to
        textdiff.TextDiff. The first option that is set among standard_diff,
        context_diff and unified_diff selects the TextDiff method; otherwise
        normal_diff is used. The result of that method is returned.
        """
        # Reuse the text extracted in __init__ instead of unzipping again.
        lines1 = self.make_list_of_lines(self.text1)
        lines2 = self.make_list_of_lines(self.text2)
        differ = textdiff.TextDiff(self.file1_name, self.file2_name,
                                   self.options, lines1, lines2)

        if self.options.standard_diff:
            return differ.gnu_diff()
        if self.options.context_diff:
            return differ.context_diff()
        if self.options.unified_diff:
            return differ.unified_diff()
        return differ.normal_diff()

