diff options
author | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-26 18:25:11 +0300 |
---|---|---|
committer | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-26 18:25:11 +0300 |
commit | fc03e6b942a9170bda5964f95893c18123b340e4 (patch) | |
tree | 85a610c8005336625b2410ce8cc35652576ac23d | |
parent | a8b521dc0f8e810f97630551406ccd8d1590371f (diff) |
Extract the accepted-files check into a function, and sort the sub-result list by tag name
Change-Id: I9b5c003b6363ac50cf7c838cc4e954c14ef935de
-rwxr-xr-x | bin/ooxml-analyze.py | 90 |
1 files changed, 57 insertions, 33 deletions
diff --git a/bin/ooxml-analyze.py b/bin/ooxml-analyze.py index 8dbfe8cacd0b..12b9ba590db9 100755 --- a/bin/ooxml-analyze.py +++ b/bin/ooxml-analyze.py @@ -42,6 +42,10 @@ def main(argv): sub_result_list = [] count_elements(ext_dir, sub_result_list) sub_result_path = os.path.join(outputdir, sub_result_name) + + # sort the result sub list according to tag names + sub_result_list = sorted(sub_result_list, key=lambda x: list(x[0].keys())[0], reverse=False) + with open(sub_result_path, "w") as log_file: pprint.pprint(sub_result_list, log_file) else: @@ -89,6 +93,23 @@ def replace_namespace_with_alias(filename, element): element = element.replace("{" + element_ns + "}", "") return element +def is_file_in_accepted_files(filename): + if(filename.endswith("[Content_Types].xml") or \ + filename.endswith("docProps/custom.xml") or \ + filename.endswith("docProps/app.xml") or \ + filename.endswith("presentation.xml") or \ + filename.endswith("viewProps.xml") or \ + filename.endswith("tableStyles.xml") or \ + filename.endswith("presProps.xml") or \ + "ppt/slideLayouts" in filename or \ + "ppt/slideMasters" in filename or \ + "ppt/theme" in filename or \ + filename.endswith("docProps/core.xml") or not \ + filename.endswith(".xml")): + return False + + return True + # counts tags, attribute names and values of xmls def count_elements(extracted_files_dir, result_list): @@ -101,40 +122,43 @@ def count_elements(extracted_files_dir, result_list): # parse xmls and count elements for xmlfile in list_of_files: - if(xmlfile.endswith(".xml")): - tree = ET.parse(xmlfile) - root = tree.getroot() - - # start to count - for child in root.iter(): - tag = replace_namespace_with_alias(xmlfile, child.tag) - tag_idx = get_index_of_tag(tag, result_list) - - # count tags - if (tag_idx == -1): - tmp_list = [{tag: 1},{},{},{}] - result_list.append(tmp_list) + if not is_file_in_accepted_files(xmlfile): + continue + + print(xmlfile) + tree = ET.parse(xmlfile) + root = tree.getroot() + + # start to count + 
for child in root.iter(): + tag = replace_namespace_with_alias(xmlfile, child.tag) + tag_idx = get_index_of_tag(tag, result_list) + + # count tags + if (tag_idx == -1): + tmp_list = [{tag: 1},{},{},{}] + result_list.append(tmp_list) + else: + result_list[tag_idx][0][tag] += 1 + + # count attribute names and values of current tag + for attr_name, attr_value in child.attrib.items(): + attr_name = replace_namespace_with_alias(xmlfile, attr_name) + if not attr_name in result_list[tag_idx][1].keys(): + result_list[tag_idx][1][attr_name] = 1 + else: + result_list[tag_idx][1][attr_name] +=1 + + if not attr_value in result_list[tag_idx][2].keys(): + result_list[tag_idx][2][attr_value] = 1 + else: + result_list[tag_idx][2][attr_value] +=1 + + if not (str(child.text) == "None"): + if not child.text in result_list[tag_idx][3].keys(): + result_list[tag_idx][3][child.text] = 1 else: - result_list[tag_idx][0][tag] += 1 - - # count attribute names and values of current tag - for attr_name, attr_value in child.attrib.items(): - attr_name = replace_namespace_with_alias(xmlfile, attr_name) - if not attr_name in result_list[tag_idx][1].keys(): - result_list[tag_idx][1][attr_name] = 1 - else: - result_list[tag_idx][1][attr_name] +=1 - - if not attr_value in result_list[tag_idx][2].keys(): - result_list[tag_idx][2][attr_value] = 1 - else: - result_list[tag_idx][2][attr_value] +=1 - - if not (str(child.text) == "None"): - if not child.text in result_list[tag_idx][3].keys(): - result_list[tag_idx][3][child.text] = 1 - else: - result_list[tag_idx][3][child.text] += 1 + result_list[tag_idx][3][child.text] += 1 # gets the position of "tag" element in result list. If element is not exist, # return -1 that points the last index of the list. |