diff options
author | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-06-08 14:31:33 +0200 |
---|---|---|
committer | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-06-09 12:02:31 +0200 |
commit | 0e1a0ecffa055062a21815ab13eb6e4f8c769b8f (patch) | |
tree | 19f3be8aca6f2463841b01258e1aaf2882f3579c /bin | |
parent | b1d57168b8cecc1713e729c22260feaf33fab29f (diff) |
get-forum-attachments: Add 2 more mso forums
Add login mechanism for them
Also add --config and --outdir arguments to
set the paths
Change-Id: I641f10396e1f4cf5bdb19da287b1a2962ff4e2ca
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135495
Tested-by: Jenkins
Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
Diffstat (limited to 'bin')
-rwxr-xr-x | bin/get-forum-attachments.py | 119 |
1 files changed, 90 insertions, 29 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py index 9b967d5a4963..4e46befefa9a 100755 --- a/bin/get-forum-attachments.py +++ b/bin/get-forum-attachments.py @@ -6,18 +6,24 @@ # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. -import requests -from requests.adapters import HTTPAdapter -from requests.packages.urllib3.util.retry import Retry +import argparse +import configparser +import hashlib import magic -import tempfile import os +import requests import shutil +import sys +import tempfile + +from bs4 import BeautifulSoup from attachment_mimetypes import mimetypes from concurrent.futures import ThreadPoolExecutor, as_completed +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry -# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages -languages = { +forums = { + # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages 'en': "https://forum.openoffice.org/en/forum", 'es': "https://forum.openoffice.org/es/forum", 'fr': "https://forum.openoffice.org/fr/forum", @@ -31,22 +37,54 @@ languages = { 'de': "https://www.openoffice-forum.de", 'de2': "https://www.libreoffice-forum.de", 'de3': "https://de.openoffice.info", + # Others + 'mso-en': "https://www.msofficeforums.com", + 'mso-de': "https://www.ms-office-forum.net/forum", } -def get_attachments_from_url(lang, url): +def do_login(session, url, configFile): + config = configparser.ConfigParser() + + config.read(configFile) + username = config.get('login', 'username') + password = config.get('login', 'password') + resp = session.post(url + '/login.php?do=login', { + 'vb_login_username': username, + 'vb_login_password': '', + 'vb_login_md5password': hashlib.md5(password.encode()).hexdigest(), + 'vb_login_md5password_utf': hashlib.md5(password.encode()).hexdigest(), + 'cookieuser': 1, + 'do': 'login', + 
's': '', + 'securitytoken': 'guest' + }) + + if resp.status_code != 200: + return False + + soup = BeautifulSoup(resp.content, 'lxml') + for p in soup.find_all("p"): + if 'Thank you for logging in' in p.get_text(): + return True + elif 'Danke für Ihre Anmeldung' in p.get_text(): + return True + + return False + +def get_attachments_from_url(lang, url, pathes): print("Checking " + url) - startPoint = 0 + startIndex = 0 # Keep the index and resume from there - indexFile = lang + ".index" + indexFile = os.path.join(pathes.outdir, lang + ".index") if os.path.isfile(indexFile): with open(indexFile) as f: - startPoint = int(f.readline().rstrip()) + 1 + startIndex = int(f.readline().rstrip()) + 1 else: if lang == 'hu': - startPoint = 1300 + startIndex = 1300 session = requests.Session() retry = Retry(connect=3, backoff_factor=0.5) @@ -54,9 +92,18 @@ def get_attachments_from_url(lang, url): session.mount('http://', adapter) session.mount('https://', adapter) + if lang.startswith("mso"): + if not do_login(session, url, pathes.config): + print("Can't log in to " + url) + return + invalidCount = 0 - for i in range(startPoint, 999999): - fileUrl = url + "/download/file.php?id=" + str(i) + for i in range(startIndex, 999999): + if lang.startswith("mso"): + fileUrl = url + "/attachment.php?attachmentid=" + str(i) + else: + fileUrl = url + "/download/file.php?id=" + str(i) + h = session.head(fileUrl) header = h.headers content_type = header.get('content-type') @@ -71,34 +118,48 @@ def get_attachments_from_url(lang, url): else: invalidCount = 0 - if content_type == 'application/octet-stream': - r = session.get(fileUrl, allow_redirects=True) - with tempfile.NamedTemporaryFile() as tmp: - tmp.write(r.content) - mimetype = magic.from_file(tmp.name, mime=True) - if mimetype in mimetypes: - suffix = mimetypes[mimetype] - try: - os.mkdir(suffix) - except: - pass + r = session.get(fileUrl, allow_redirects=True) + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(r.content) + mimetype 
= magic.from_file(tmp.name, mime=True) + if mimetype in mimetypes: + suffix = mimetypes[mimetype] + suffixDir = os.path.join(pathes.outdir, suffix) + try: + os.mkdir(suffixDir) + except: + pass - download = suffix + '/' + "forum-" + lang + '-' + str(i) + '.' + suffix + download = os.path.join(suffixDir, + "forum-" + lang + '-' + str(i) + '.' + suffix) - print("Downloading as " + download) - shutil.copy(tmp.name, download) + print("Downloading as " + download) + shutil.copy(tmp.name, download) # Save the index with open(indexFile, 'w') as f: f.write(str(i)) if __name__ == '__main__': + parser = argparse.ArgumentParser() + + parser.add_argument('--outdir', action='store', dest="outdir", required=True) + parser.add_argument('--config', action="store", dest="config", required=True) + + pathes = parser.parse_args() + + if not os.path.exists(pathes.outdir) or os.path.isfile(pathes.outdir): + print("Outdir folder doesn't exists") + sys.exit(1) + elif not os.path.exists(pathes.config) or not os.path.isfile(pathes.config): + print("Config file doesn't exists") + sys.exit(1) processes = [] # 10 at a time seems to work fine with ThreadPoolExecutor(max_workers=10) as executor: - for lang, url in languages.items(): - processes.append(executor.submit(get_attachments_from_url, lang, url)) + for lang, url in forums.items(): + processes.append(executor.submit(get_attachments_from_url, lang, url, pathes)) for task in as_completed(processes): result = task.result() |