From 7c03a4c092d9cba10ecb22e7f97aaca851259f1f Mon Sep 17 00:00:00 2001
From: Gülşah Köse <gulsah.kose@collabora.com>
Date: Tue, 8 Jun 2021 08:43:30 +0300
Subject: Improve error handling, exclude None texts

Change-Id: Idedad9c414311d95c8888355ea70a913f8e0ddf7
---
 bin/ooxml-analyze.py | 103 ++++++++++++++++++++++++++++-----------------------
 1 file changed, 57 insertions(+), 46 deletions(-)

diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index a7e2bc2a549f..87acd377c854 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -88,8 +88,11 @@ def extract_files(inputdir, extracted_files_dir):
             filepath = os.path.join(inputdir, filename)
             extracted_file_path = os.path.join(extracted_files_dir, filename)
 
-            with ZipFile(filepath) as zipObj:
-                zipObj.extractall(extracted_file_path)
+            try:
+                with ZipFile(filepath) as zipObj:
+                    zipObj.extractall(extracted_file_path)
+            except Exception:
+                print("%s is problematic" % filename)
         else:
             continue
 
@@ -126,6 +129,11 @@ def is_file_in_accepted_files(filename):
        "ppt/slideMasters" in filename or \
        "ppt/theme" in filename or \
        "ppt/notesMasters" in filename or \
+       "ppt/notesSlides" in filename or \
+       "ppt/handoutMasters" in filename or \
+       "ppt/tags" in filename or \
+       "pptx/customXml" in filename or \
+       "ppt/diagrams" in filename or \
        filename.endswith("docProps/core.xml") or not \
        filename.endswith(".xml")):
        return False
@@ -149,51 +157,54 @@ def count_elements(extracted_files_dir, result_list, concanated_texts_list):
 
         print(xmlfile)
 
-        # start to count
-        for event, child in etree.iterparse(xmlfile, events=('start', 'end')):
-            tag = replace_namespace_with_alias(xmlfile, child.tag)
-            tag_idx = get_index_of_tag(tag, result_list)
-
-            if event == "start":
-                # count tags
-                if (tag_idx == -1):
-                    tmp_list = [{tag: 1},{},{},{}]
-                    result_list.append(tmp_list)
-                else:
-                    result_list[tag_idx][0][tag] += 1
-
-                # count attribute names and values of current tag
-                for attr_name, attr_value in child.attrib.items():
-                    attr_name = replace_namespace_with_alias(xmlfile, attr_name)
-                    if not attr_name in result_list[tag_idx][1].keys():
-                        result_list[tag_idx][1][attr_name] = 1
+        try:
+            # start to count
+            for event, child in etree.iterparse(xmlfile, events=('start', 'end')):
+                tag = replace_namespace_with_alias(xmlfile, child.tag)
+                tag_idx = get_index_of_tag(tag, result_list)
+
+                if event == "start":
+                    # count tags
+                    if (tag_idx == -1):
+                        tmp_list = [{tag: 1},{},{},{}]
+                        result_list.append(tmp_list)
                     else:
-                        result_list[tag_idx][1][attr_name] +=1
-
-                    if not attr_value in result_list[tag_idx][2].keys():
-                        result_list[tag_idx][2][attr_value] = 1
-                    else:
-                        result_list[tag_idx][2][attr_value] +=1
-
-                # concanated text will be resetted in every paragraph begining
-                if tag == "a:p":
-                    concatenated_text = ""
-
-
-            if event == "end":
-                # Detect seperate texts in paragraph and concanate them.
-                if tag == "a:t":
-                    concatenated_text += str(child.text)
-                # End of the paragraph element, add the text as list item.
-                if tag == "a:p" and concatenated_text != "":
-                    concanated_texts_list.append(concatenated_text)
-
-                # count text contents except consisted of whitespaces.
-                if not (str(child.text) == "None" or str(child.text).strip()==""):
-                    if not child.text in result_list[tag_idx][3].keys():
-                        result_list[tag_idx][3][child.text] = 1
-                    else:
-                        result_list[tag_idx][3][child.text] += 1
+                        result_list[tag_idx][0][tag] += 1
+
+                    # count attribute names and values of current tag
+                    for attr_name, attr_value in child.attrib.items():
+                        attr_name = replace_namespace_with_alias(xmlfile, attr_name)
+                        if not attr_name in result_list[tag_idx][1].keys():
+                            result_list[tag_idx][1][attr_name] = 1
+                        else:
+                            result_list[tag_idx][1][attr_name] +=1
+
+                        if not attr_value in result_list[tag_idx][2].keys():
+                            result_list[tag_idx][2][attr_value] = 1
+                        else:
+                            result_list[tag_idx][2][attr_value] +=1
+
+                    # concatenated text is reset at the beginning of every paragraph
+                    if tag == "a:p":
+                        concatenated_text = ""
+
+
+                if event == "end":
+                    # Detect separate texts in paragraph and concatenate them.
+                    if tag == "a:t" and str(child.text) != "None":
+                        concatenated_text += str(child.text)
+                    # End of the paragraph element, add the text as list item.
+                    if tag == "a:p" and concatenated_text != "":
+                        concanated_texts_list.append(concatenated_text)
+
+                    # count text contents, except those consisting only of whitespace.
+                    if not (str(child.text) == "None" or str(child.text).strip()==""):
+                        if not child.text in result_list[tag_idx][3].keys():
+                            result_list[tag_idx][3][child.text] = 1
+                        else:
+                            result_list[tag_idx][3][child.text] += 1
+        except Exception as exception:
+            print("%s has %s " % (xmlfile, exception))
 
 # gets the position of "tag" element in result list. If element is not exist,
 # return -1 that points the last index of the list.
-- 
cgit