diff options
author | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-24 01:05:37 +0300 |
---|---|---|
committer | Gülşah Köse <gulsah.kose@collabora.com> | 2021-05-24 10:00:29 +0300 |
commit | d53433ed5f10315e26cb709d6295fc5317c453cb (patch) | |
tree | 5fdbdafe113ca7c578d0b65bd756c5e33c2c5025 | |
parent | 816c2bb27bde42700fd0154cc250e9dcfd5d744e (diff) |
Initial commit of ooxml analyze tool
* Reads input ooxml files given by the user
* Extracts the files into output directory given by user
* Counts tags, attributes, and values.
* Holds the result in result_list structure.
result list is a list that contains a nested list for each tag
Each list holds four dictionaries,
- first one for tags and counts
- second one for attributes and counts,
- third one for values and counts
- last one for the plain texts between tags
* Prints the result list.
* As an option, it can skip the extract step and read the path of the
already extracted files from the user.
Change-Id: I2d942984cad118fc479c5b04acf8b8a72c519807
-rwxr-xr-x | bin/ooxml-analyze.py | 142 |
1 files changed, 142 insertions, 0 deletions
#!/usr/bin/python
"""Count tags, attributes, attribute values and texts in OOXML documents.

The tool unzips every .pptx/.docx/.xlsx file found in an input directory,
parses each extracted ``*.xml`` file and pretty-prints the collected
statistics.

The statistics are kept in a "result list": a list holding one entry per
distinct tag.  Each entry is a list of four dictionaries:

    [0] {tag: number of occurrences}
    [1] {attribute name: number of occurrences on that tag}
    [2] {attribute value: number of occurrences on that tag}
    [3] {element text: number of occurrences on that tag}
"""

import getopt
import os
import pprint
import shutil
import sys
import xml.etree.ElementTree as ET
from zipfile import ZipFile

# Usage line printed for -h, for unparsable options and for missing input.
USAGE = 'analyze.py -i <inputdir> -o <outputdir>'

# File name suffixes that are treated as OOXML packages.
OOXML_EXTENSIONS = (".pptx", ".docx", ".xlsx")


def main(argv):
    """Parse the command line, run the analysis and print the result list.

    Options:
        -h            print the usage line and exit
        -i, --idir    directory containing the OOXML documents
        -o, --odir    directory that receives the 'extractedfiles' folder
        -e            directory with already extracted documents; when
                      given, the extract step is skipped

    Exits with status 2 when the options cannot be parsed or when neither
    an input directory nor an extracted-files directory was given.
    """
    inputdir = ''
    outputdir = ''
    extracted_files_dir_by_user = ''

    # read the arguments
    try:
        opts, _args = getopt.getopt(argv, "hi:o:e:", ["idir=", "odir="])
    except getopt.GetoptError:
        print(USAGE)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(USAGE)
            sys.exit()
        elif opt == '-e':
            extracted_files_dir_by_user = arg
        elif opt in ("-i", "--idir"):
            inputdir = arg
        elif opt in ("-o", "--odir"):
            outputdir = arg

    # holds the result structure of the analysis
    result_list = []

    if extracted_files_dir_by_user:
        # use the user defined directory of already extracted ooxml files.
        count_elements(extracted_files_dir_by_user, result_list)
    else:
        # Without an input directory there is nothing to extract.  Bail out
        # before extract_files() wipes the output directory and then fails
        # on os.listdir('').
        if not inputdir:
            print(USAGE)
            sys.exit(2)

        # use the default directory path for extracted ooxml files.
        extracted_files_dir = os.path.join(outputdir, 'extractedfiles')

        extract_files(inputdir, extracted_files_dir)
        count_elements(extracted_files_dir, result_list)

    pprint.pprint(result_list)


def extract_files(inputdir, extracted_files_dir):
    """Unzip every OOXML document of *inputdir* into *extracted_files_dir*.

    An existing *extracted_files_dir* is removed first.  Each document is
    extracted into its own sub directory named "1", "2", ... in the order
    in which os.listdir() reports the files.
    """
    # clean the extracted files directory first
    if os.path.exists(extracted_files_dir):
        shutil.rmtree(extracted_files_dir)

    # directory name for the next ooxml document in the extracted files dir
    counter = 1

    for filename in os.listdir(inputdir):
        # Names starting with "~" are skipped (presumably temporary/lock
        # files written next to an open document).
        if filename.startswith("~") or not filename.endswith(OOXML_EXTENSIONS):
            continue

        filepath = os.path.join(inputdir, filename)
        extracted_file_path = os.path.join(extracted_files_dir, str(counter))

        with ZipFile(filepath) as zip_obj:
            zip_obj.extractall(extracted_file_path)

        counter += 1


def _increment(counts, key):
    """Add one to counts[key], starting from zero for an unseen key."""
    counts[key] = counts.get(key, 0) + 1


def count_elements(extracted_files_dir, result_list):
    """Count tags, attribute names, attribute values and texts of all xmls.

    Every file below *extracted_files_dir* whose name ends with ".xml" is
    parsed, and the counts are accumulated in place into *result_list* (see
    the module docstring for its layout).  If the directory does not exist
    a message is printed and *result_list* is left untouched.
    """
    # make sure the extracted files directory exists
    if not os.path.exists(extracted_files_dir):
        print("Extracted files directory is not exist")
        return

    # Map each known tag to its position in result_list so that every
    # element costs a dict lookup instead of a scan over the whole list.
    # setdefault keeps the first position, matching get_index_of_tag().
    index_by_tag = {}
    for idx, entry in enumerate(result_list):
        for known_tag in entry[0]:
            index_by_tag.setdefault(known_tag, idx)

    # parse xmls and count elements
    for xmlfile in get_list_of_files(extracted_files_dir):
        if not xmlfile.endswith(".xml"):
            continue

        root = ET.parse(xmlfile).getroot()

        for child in root.iter():
            tag = str(child.tag)
            tag_idx = index_by_tag.get(tag)

            # count tags
            if tag_idx is None:
                tag_idx = len(result_list)
                index_by_tag[tag] = tag_idx
                result_list.append([{tag: 1}, {}, {}, {}])
            else:
                result_list[tag_idx][0][tag] += 1

            entry = result_list[tag_idx]

            # count attribute names and values of the current tag
            for attr_name, attr_value in child.attrib.items():
                _increment(entry[1], attr_name)
                _increment(entry[2], attr_value)

            # Element.text is None when the element has no text.  Compare
            # against None itself so that a literal "None" text is counted.
            if child.text is not None:
                _increment(entry[3], child.text)


def get_index_of_tag(tag, result_list):
    """Return the position of *tag*'s entry in *result_list*.

    Returns -1 if no entry for *tag* exists.  A caller that appends the
    missing entry right away can keep using -1 as an index, because it then
    refers to the freshly appended last entry.
    """
    for idx, tag_list in enumerate(result_list):
        if tag in tag_list[0]:
            return idx
    return -1


def get_list_of_files(directory_name):
    """Return the paths of all files below *directory_name*, recursively.

    Sub directories are descended into; only non-directory entries end up
    in the returned list.
    """
    all_files = []

    for filename in os.listdir(directory_name):
        full_path = os.path.join(directory_name, filename)
        if os.path.isdir(full_path):
            # extend in place instead of building a new list per directory
            all_files.extend(get_list_of_files(full_path))
        else:
            all_files.append(full_path)

    return all_files


if __name__ == "__main__":
    main(sys.argv[1:])