Compare texts

Change-Id: I0aee62ffa79dbf716bb9ec2af2bc06078e1eb566
author: Gülşah Köse <gulsah.kose@collabora.com> 2021-06-08 08:50:37 +0300
committer: Gülşah Köse <gulsah.kose@collabora.com> 2021-06-08 08:50:37 +0300
commit: 4bf4b28b2d6316a34d0966d1109b76162aa0b336 (patch)
tree: ceae35e7e05ff81e10490ab230f85594348d5cc1
parent: 7c03a4c092d9cba10ecb22e7f97aaca851259f1f (diff)
1 files changed, 68 insertions, 19 deletions
diff --git a/bin/compare-ooxml-analyze-results.py b/bin/compare-ooxml-analyze-results.py
index d0d89fc30a31..fdd4fa8a0243 100644
--- a/bin/compare-ooxml-analyze-results.py
+++ b/bin/compare-ooxml-analyze-results.py
@@ -1,6 +1,6 @@
 #!/usr/bin/python
 
-import sys, getopt, os, pprint, ast
+import sys, getopt, os, pprint, ast, difflib
 
 original_results_dir = ''
 saved_results_dir = ''
@@ -24,13 +24,18 @@ def main(argv):
          global saved_results_dir
          saved_results_dir = arg
 
-   # takes file list produced by ooxml-analyze.py tool.
-   original_result_files = get_list_of_files(original_results_dir)
-   saved_result_files = get_list_of_files(saved_results_dir)
+   # takes result file list produced by ooxml-analyze.py tool. <filename>.result
+   original_result_files = get_list_of_result_files(original_results_dir)
+   saved_result_files = get_list_of_result_files(saved_results_dir)
+   compare_results(original_result_files, saved_result_files)
 
-   compare(original_result_files, saved_result_files)
+   # takes the list of concatenated text files produced by the ooxml-analyze.py tool. <filename>.text
+   original_text_files = get_list_of_text_files(original_results_dir)
+   saved_text_files = get_list_of_text_files(saved_results_dir)
+   compare_texts(original_text_files, saved_text_files)
 
-def get_list_of_files(directory_name):
+# collects <filename>.result files
+def get_list_of_result_files(directory_name):
 
    list_of_file = os.listdir(directory_name)
    all_files = list()
@@ -38,16 +43,32 @@ def get_list_of_files(directory_name):
    for filename in list_of_file:
       full_path = os.path.join(directory_name, filename)
       if os.path.isdir(full_path):
-         all_files = all_files + get_list_of_files(full_path)
+         all_files = all_files + get_list_of_result_files(full_path)
       else:
-         all_files.append(full_path)
+         if filename.endswith(".result"):
+            all_files.append(full_path)
+
+   return all_files
+
+# collects <filename>.text files
+def get_list_of_text_files(directory_name):
+
+   list_of_file = os.listdir(directory_name)
+   all_files = list()
+
+   for filename in list_of_file:
+      full_path = os.path.join(directory_name, filename)
+      if os.path.isdir(full_path):
+         all_files = all_files + get_list_of_result_files(full_path)
+      else:
+         if filename.endswith(".text"):
+            all_files.append(full_path)
 
    return all_files
 
 
 # compares the  elements the original results and and after saved results.
-def compare(original_result_files, saved_result_files):
-   ind = 1
+def compare_results(original_result_files, saved_result_files):
    for original_filepath in original_result_files:
       saved_filepath = get_corresponding_file(original_filepath)
       if saved_filepath == '':
@@ -57,11 +78,31 @@ def compare(original_result_files, saved_result_files):
       original_result_list = create_list_from_result_file(original_filepath)
       saved_result_list = create_list_from_result_file(saved_filepath)
 
-      check_text_contents(original_result_list, saved_result_list)
+      check_text_contents(original_result_list, saved_result_list, original_filepath)
+
+def compare_texts(original_texts_file, saved_texts_file):
+   for original_filepath in original_texts_file:
+      saved_filepath = get_corresponding_file(original_filepath)
+      if saved_filepath == '':
+         print("No result text file after roundtrip for " + original_filepath)
+         continue
+
+      with open(original_filepath) as file_1:
+         original_file_text = file_1.readlines()
+
+      with open(saved_filepath) as file_2:
+         saved_file_text = file_2.readlines()
+
+ #     if os.path.exists("./result"):
+ #        os.remove("result")
 
+      for line in difflib.unified_diff(original_file_text, saved_file_text, fromfile=original_filepath, tofile=saved_filepath, lineterm=''):
+         with open("result", "a") as log_file:
+            print(line, file=log_file)
+         log_file.close()
 
-# checks if we missed any text content after saving the file.
-def check_text_contents(original_result_list, saved_result_list):
+# checks if we missed any text content after saving the file (except a:t, which is compared separately by the compare_texts function).
+def check_text_contents(original_result_list, saved_result_list, original_file_path):
 
    # detect if we lost or added any text on existing texts of original version.
    for line in original_result_list:
@@ -69,6 +110,10 @@ def check_text_contents(original_result_list, saved_result_list):
       if not bool(text_dict): # check if text context is empty
          continue
       tag = list(line[0].keys())[0] #if there is a text context, find the owner tag.
+
+      if tag == "a:t": # exclude a:t text; it is compared separately
+         continue
+
       for sline in saved_result_list:
          stag = list(sline[0].keys())[0]
          if stag == tag: # check if saved results has same tag too.
@@ -76,18 +121,22 @@ def check_text_contents(original_result_list, saved_result_list):
             if text_dict != saved_text_dict:
                for key, val in text_dict.items():
                   if key not in saved_text_dict.keys():
-                     print ("We lost %d \"%s\" text in %s tag." % (val, key, tag))
+                     print ("We lost %d \"%s\" text in %s tag in %s." % (val, key, tag, original_file_path))
                   elif val > saved_text_dict[key]:
-                     print ("We lost %d \"%s\" text in %s tag." % (val - saved_text_dict[key], key, tag))
+                     print ("We lost %d \"%s\" text in %s tag in %s." % (val - saved_text_dict[key], key, tag, original_file_path))
                   elif val < saved_text_dict[key]:
-                     print("We added extra %d \"%s\" text in %s tag" % (saved_text_dict[key] - val, key, tag))
+                     print("We added extra %d \"%s\" text in %s tag in %s" % (saved_text_dict[key] - val, key, tag, original_file_path))
 
-   # detect if we add any new text that not existed in original version
+   # detect if we added any new text that did not exist in the original version (reverse comparison)
    for line in saved_result_list:
       saved_text_dict = line[3]
       if not bool(saved_text_dict): # check if text context is empty
          continue
       tag = list(line[0].keys())[0] #if there is a text context, find the owner tag.
+
+      if tag == "a:t": # exclude a:t text; it is compared separately
+         continue
+
       for sline in original_result_list:
          stag = list(sline[0].keys())[0]
          if stag == tag: # check if original results has same tag too.
@@ -95,9 +144,9 @@ def check_text_contents(original_result_list, saved_result_list):
             if saved_text_dict != text_dict:
                for key, val in saved_text_dict.items():
                   if key not in text_dict.keys():
-                     print ("We add extra %d \"%s\" text in %s tag." % (val, key, tag))
+                     print ("We add extra %d \"%s\" text in %s tag in %s." % (val, key, tag, original_file_path))
 
-#reads the file context and create the result list structer from.
+#reads the file context and create the result list structure from.
 # eg res_list[[{},{},{},{}],[{},{},{},{}]...]                                                                 ]
 def create_list_from_result_file(filepath):
    result_list = []
author	Gülşah Köse <gulsah.kose@collabora.com>	2021-06-08 08:50:37 +0300
committer	Gülşah Köse <gulsah.kose@collabora.com>	2021-06-08 08:50:37 +0300
commit	4bf4b28b2d6316a34d0966d1109b76162aa0b336 (patch)
tree	ceae35e7e05ff81e10490ab230f85594348d5cc1
parent	7c03a4c092d9cba10ecb22e7f97aaca851259f1f (diff)