diff options
author | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-31 16:08:23 +0300 |
---|---|---|
committer | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-31 16:08:28 +0300 |
commit | 056ebfae35f6725b9089439a7bf868dad48fdd0f (patch) | |
tree | 5371bb683a29e987fc2d0266fd04aed566b6d4d2 | |
parent | 076e9b84985ef81912d5f58da241d790ae17ed33 (diff) |
Fix the use of an existing extracted files path and change the result output
The tool was counting text content that consists only of whitespace.
Prevent this: do not count such text as text content, e.g. " ".
Change-Id: Ib71123b82082166addd423b734661a158ec2254e
-rwxr-xr-x | bin/ooxml-analyze.py | 38 |
1 files changed, 22 insertions, 16 deletions
diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 12b9ba590db9..9db39d8c47da 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -8,6 +8,7 @@ def main(argv): inputdir = '' outputdir = '' extracted_files_dir_by_user = '' + extracted_files_dir = '' #read the arguments try: @@ -34,23 +35,27 @@ def main(argv): # use default directory path for extracted ooxml files. extracted_files_dir = os.path.join(outputdir, 'extractedfiles') extract_files(inputdir, extracted_files_dir) - - # create seperate result files for each ooxml document as <document name>.result in output directory - for ext_dir in get_list_of_subdir(extracted_files_dir): - i = ext_dir.rfind('/') - sub_result_name = ext_dir[i+1:] + ".result" - sub_result_list = [] - count_elements(ext_dir, sub_result_list) - sub_result_path = os.path.join(outputdir, sub_result_name) - - # sort the result sub list according to tag names - sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) - - with open(sub_result_path, "w") as log_file: - pprint.pprint(sub_result_list, log_file) else: # use user defined directory path for extracted ooxml files. 
- count_elements(extracted_files_dir_by_user, result_list) + extracted_files_dir = extracted_files_dir_by_user + + # create seperate result files for each ooxml document as <document name>.result in output directory + for ext_dir in get_list_of_subdir(extracted_files_dir): + i = ext_dir.rfind('/') + sub_result_name = ext_dir[i+1:] + ".result" + sub_result_list = [] + count_elements(ext_dir, sub_result_list) + sub_result_path = os.path.join(outputdir, sub_result_name) + + # sort the result sub list according to tag names + sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) + + if os.path.exists(sub_result_path): + os.remove(sub_result_path) + for i in sub_result_list: + with open(sub_result_path, "a") as log_file: + print(i, file=log_file) + log_file.close() # unzip all ooxml files into the given path def extract_files(inputdir, extracted_files_dir): @@ -154,7 +159,8 @@ def count_elements(extracted_files_dir, result_list): else: result_list[tag_idx][2][attr_value] +=1 - if not (str(child.text) == "None"): + # count text contents except consisted of whitespaces. + if not (str(child.text) == "None" or str(child.text).strip()==""): if not child.text in result_list[tag_idx][3].keys(): result_list[tag_idx][3][child.text] = 1 else: |