diff options
author | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-12-12 11:39:46 +0100 |
---|---|---|
committer | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-12-12 13:08:48 +0000 |
commit | 7bd768846aa13dfa98fc0fb480d1818eacb47bf4 (patch) | |
tree | 2a9705e91cfadf09d06ef9e8ad94c3bde1fe7c1c /bin | |
parent | c0f61ec33be9b76116495907e4874aaae5990088 (diff) |
get-forum-attachment: Add optional get-file argument
to help to download a specific file
e.g: --get-file forum-mso-en4-93354.xls
Change-Id: Id6a2388a3388d7a332a0a7e1bd8042094d835ca7
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/143984
Tested-by: Xisco Fauli <xiscofauli@libreoffice.org>
Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
Diffstat (limited to 'bin')
-rwxr-xr-x | bin/get-forum-attachments.py | 90 |
1 files changed, 64 insertions, 26 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py index 4300778e4fda..74827fff929c 100755 --- a/bin/get-forum-attachments.py +++ b/bin/get-forum-attachments.py @@ -43,15 +43,23 @@ forums = { 'mso-en2': ["https://www.excelguru.ca/forums", False, 0], 'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100], 'mso-en4': ["https://www.excelforum.com", True, 5100], - # lang : [url, doLogin, startIndex] + # forum : [url, doLogin, startIndex] } -def get_attachment_query(lang): - if lang.startswith("mso"): +def get_attachment_query(forum): + if forum.startswith("mso"): return "/attachment.php?attachmentid=" else: return "/download/file.php?id=" +def createSession(): + session = requests.Session() + retry = Retry(connect=3, backoff_factor=0.5) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + return session + def login(session, url, configFile): config = configparser.ConfigParser() @@ -81,7 +89,7 @@ def login(session, url, configFile): return False -def get_attachments_from_url(lang, config, pathes): +def get_attachments_from_url(forum, config, args): url = config[0] doLogin = config[1] startIndex = config[2] @@ -89,25 +97,21 @@ def get_attachments_from_url(lang, config, pathes): print("Checking " + url) # Keep the index and resume from there - indexFile = os.path.join(pathes.outdir, lang + ".index") + indexFile = os.path.join(args.outdir, forum + ".index") if os.path.isfile(indexFile): with open(indexFile) as f: startIndex = int(f.readline().rstrip()) + 1 - session = requests.Session() - retry = Retry(connect=3, backoff_factor=0.5) - adapter = HTTPAdapter(max_retries=retry) - session.mount('http://', adapter) - session.mount('https://', adapter) + session = createSession() if doLogin: - if not login(session, url, pathes.config): + if not login(session, url, args.config): print("Can't log in to " + url) return invalidCount = 0 for i in range(startIndex, 999999): - fileUrl = url + get_attachment_query(lang) + str(i) + fileUrl = url + get_attachment_query(forum) + str(i) h = session.head(fileUrl) header = h.headers @@ -129,14 +133,14 @@ def get_attachments_from_url(lang, config, pathes): mimetype = magic.from_file(tmp.name, mime=True) if mimetype in mimetypes: suffix = mimetypes[mimetype] - suffixDir = os.path.join(pathes.outdir, suffix) + suffixDir = os.path.join(args.outdir, suffix) try: os.mkdir(suffixDir) except: pass download = os.path.join(suffixDir, - "forum-" + lang + '-' + str(i) + '.' + suffix) + "forum-" + forum + '-' + str(i) + '.' + suffix) print("Downloading as " + download) shutil.copy(tmp.name, download) @@ -150,23 +154,57 @@ if __name__ == '__main__': parser.add_argument('--outdir', action='store', dest="outdir", required=True) parser.add_argument('--config', action="store", dest="config", required=True) + parser.add_argument('--get-file', action="store", dest="fileName", required=False) - pathes = parser.parse_args() + args = parser.parse_args() - if not os.path.exists(pathes.outdir) or os.path.isfile(pathes.outdir): + if not os.path.exists(args.outdir) or os.path.isfile(args.outdir): print("Outdir folder doesn't exists") sys.exit(1) - elif not os.path.exists(pathes.config) or not os.path.isfile(pathes.config): + elif not os.path.exists(args.config) or not os.path.isfile(args.config): print("Config file doesn't exists") sys.exit(1) - processes = [] - # by default, 10 at a time seems to work fine - with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor: - for lang, config in forums.items(): - processes.append(executor.submit(get_attachments_from_url, lang, config, pathes)) + if not args.fileName: + processes = [] + # by default, 10 at a time seems to work fine + with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor: + for forum, config in forums.items(): + processes.append(executor.submit(get_attachments_from_url, forum, config, args)) + + for task in as_completed(processes): + result = task.result() + if result: + print(result) + else: + fileNameSplit = args.fileName.split("-") + if fileNameSplit[0] != "forum" or (len(fileNameSplit) != 3 and len(fileNameSplit) != 4): + print("Incorrect file name") + sys.exit(1) + + forum = fileNameSplit[1] + fileId = fileNameSplit[2] + if fileNameSplit[1] == "mso": + forum += "-" + fileNameSplit[2] + fileId = fileNameSplit[3] + + url = forums[forum][0] + fileUrl = url + get_attachment_query(forum) + fileId.split(".")[0] + + session = createSession() + + doLogin = forums[forum][1] + if doLogin: + if not login(session, url, args.config): + print("Can't log in to " + url) + sys.exit(1) + + r = session.get(fileUrl, allow_redirects=True) + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(r.content) + + download = os.path.join(args.outdir, args.fileName) + + print("Downloading " + fileUrl + " as " + download) + shutil.copy(tmp.name, download) - for task in as_completed(processes): - result = task.result() - if result: - print(result) |