Fix the use of an existing extracted files path and change the result output

The tool was counting text content that consists only of whitespace. Prevent this: do not count such text as text content, e.g. " ". Change-Id: Ib71123b82082166addd423b734661a158ec2254e
author: Gülşah Köse <gulsah.kose@collabora.com> 2021-05-31 16:08:23 +0300
committer: Gülşah Köse <gulsah.kose@collabora.com> 2021-05-31 16:08:28 +0300
commit: 056ebfae35f6725b9089439a7bf868dad48fdd0f (patch)
tree: 5371bb683a29e987fc2d0266fd04aed566b6d4d2
parent: 076e9b84985ef81912d5f58da241d790ae17ed33 (diff)
1 files changed, 22 insertions, 16 deletions
diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py
index 12b9ba590db9..9db39d8c47da 100755
--- a/bin/ooxml-analyze.py
+++ b/bin/ooxml-analyze.py
@@ -8,6 +8,7 @@ def main(argv):
     inputdir = ''
     outputdir = ''
     extracted_files_dir_by_user = ''
+    extracted_files_dir = ''
 
     #read the arguments
     try:
@@ -34,23 +35,27 @@ def main(argv):
         # use default directory path for extracted ooxml files.
         extracted_files_dir = os.path.join(outputdir, 'extractedfiles')
         extract_files(inputdir, extracted_files_dir)
-
-        # create seperate result files for each ooxml document as <document name>.result in output directory
-        for ext_dir in get_list_of_subdir(extracted_files_dir):
-            i = ext_dir.rfind('/')
-            sub_result_name = ext_dir[i+1:] + ".result"
-            sub_result_list = []
-            count_elements(ext_dir, sub_result_list)
-            sub_result_path = os.path.join(outputdir, sub_result_name)
-
-            # sort the result sub list according to tag names
-            sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False)
-
-            with open(sub_result_path, "w") as log_file:
-                pprint.pprint(sub_result_list, log_file)
     else:
         # use user defined directory path for extracted ooxml files.
-        count_elements(extracted_files_dir_by_user, result_list)
+        extracted_files_dir = extracted_files_dir_by_user
+
+    # create seperate result files for each ooxml document as <document name>.result in output directory
+    for ext_dir in get_list_of_subdir(extracted_files_dir):
+        i = ext_dir.rfind('/')
+        sub_result_name = ext_dir[i+1:] + ".result"
+        sub_result_list = []
+        count_elements(ext_dir, sub_result_list)
+        sub_result_path = os.path.join(outputdir, sub_result_name)
+
+        # sort the result sub list according to tag names
+        sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False)
+
+        if os.path.exists(sub_result_path):
+            os.remove(sub_result_path)
+        for i in sub_result_list:
+            with open(sub_result_path, "a") as log_file:
+                print(i, file=log_file)
+                log_file.close()
 
 # unzip all ooxml files into the given path
 def extract_files(inputdir, extracted_files_dir):
@@ -154,7 +159,8 @@ def count_elements(extracted_files_dir, result_list):
                 else:
                     result_list[tag_idx][2][attr_value] +=1
 
-            if not (str(child.text) == "None"):
+            # count text contents except consisted of whitespaces.
+            if not (str(child.text) == "None" or str(child.text).strip()==""):
                 if not child.text in result_list[tag_idx][3].keys():
                     result_list[tag_idx][3][child.text] = 1
                 else:
author	Gülşah Köse <gulsah.kose@collabora.com>	2021-05-31 16:08:23 +0300
committer	Gülşah Köse <gulsah.kose@collabora.com>	2021-05-31 16:08:28 +0300
commit	056ebfae35f6725b9089439a7bf868dad48fdd0f (patch)
tree	5371bb683a29e987fc2d0266fd04aed566b6d4d2
parent	076e9b84985ef81912d5f58da241d790ae17ed33 (diff)