summary | refs | log | tree | commit | diff
path: root/bin
diff options
context:
space:
mode:
author: Xisco Fauli <xiscofauli@libreoffice.org> 2022-06-08 14:31:33 +0200
committer: Xisco Fauli <xiscofauli@libreoffice.org> 2022-06-09 12:02:31 +0200
commit: 0e1a0ecffa055062a21815ab13eb6e4f8c769b8f (patch)
tree: 19f3be8aca6f2463841b01258e1aaf2882f3579c /bin
parent: b1d57168b8cecc1713e729c22260feaf33fab29f (diff)
get-forum-attachments: Add 2 more mso forums
Add login mechanism for them.
Also add --config and --outdir arguments to set the paths.

Change-Id: I641f10396e1f4cf5bdb19da287b1a2962ff4e2ca
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135495
Tested-by: Jenkins
Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
Diffstat (limited to 'bin')
-rwxr-xr-x bin/get-forum-attachments.py 119
1 files changed, 90 insertions, 29 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py
index 9b967d5a4963..4e46befefa9a 100755
--- a/bin/get-forum-attachments.py
+++ b/bin/get-forum-attachments.py
@@ -6,18 +6,24 @@
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-import requests
-from requests.adapters import HTTPAdapter
-from requests.packages.urllib3.util.retry import Retry
+import argparse
+import configparser
+import hashlib
import magic
-import tempfile
import os
+import requests
import shutil
+import sys
+import tempfile
+
+from bs4 import BeautifulSoup
from attachment_mimetypes import mimetypes
from concurrent.futures import ThreadPoolExecutor, as_completed
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
-# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
-languages = {
+forums = {
+ # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
'en': "https://forum.openoffice.org/en/forum",
'es': "https://forum.openoffice.org/es/forum",
'fr': "https://forum.openoffice.org/fr/forum",
@@ -31,22 +37,54 @@ languages = {
'de': "https://www.openoffice-forum.de",
'de2': "https://www.libreoffice-forum.de",
'de3': "https://de.openoffice.info",
+ # Others
+ 'mso-en': "https://www.msofficeforums.com",
+ 'mso-de': "https://www.ms-office-forum.net/forum",
}
-def get_attachments_from_url(lang, url):
+def do_login(session, url, configFile):
+ config = configparser.ConfigParser()
+
+ config.read(configFile)
+ username = config.get('login', 'username')
+ password = config.get('login', 'password')
+ resp = session.post(url + '/login.php?do=login', {
+ 'vb_login_username': username,
+ 'vb_login_password': '',
+ 'vb_login_md5password': hashlib.md5(password.encode()).hexdigest(),
+ 'vb_login_md5password_utf': hashlib.md5(password.encode()).hexdigest(),
+ 'cookieuser': 1,
+ 'do': 'login',
+ 's': '',
+ 'securitytoken': 'guest'
+ })
+
+ if resp.status_code != 200:
+ return False
+
+ soup = BeautifulSoup(resp.content, 'lxml')
+ for p in soup.find_all("p"):
+ if 'Thank you for logging in' in p.get_text():
+ return True
+ elif 'Danke für Ihre Anmeldung' in p.get_text():
+ return True
+
+ return False
+
+def get_attachments_from_url(lang, url, pathes):
print("Checking " + url)
- startPoint = 0
+ startIndex = 0
# Keep the index and resume from there
- indexFile = lang + ".index"
+ indexFile = os.path.join(pathes.outdir, lang + ".index")
if os.path.isfile(indexFile):
with open(indexFile) as f:
- startPoint = int(f.readline().rstrip()) + 1
+ startIndex = int(f.readline().rstrip()) + 1
else:
if lang == 'hu':
- startPoint = 1300
+ startIndex = 1300
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
@@ -54,9 +92,18 @@ def get_attachments_from_url(lang, url):
session.mount('http://', adapter)
session.mount('https://', adapter)
+ if lang.startswith("mso"):
+ if not do_login(session, url, pathes.config):
+ print("Can't log in to " + url)
+ return
+
invalidCount = 0
- for i in range(startPoint, 999999):
- fileUrl = url + "/download/file.php?id=" + str(i)
+ for i in range(startIndex, 999999):
+ if lang.startswith("mso"):
+ fileUrl = url + "/attachment.php?attachmentid=" + str(i)
+ else:
+ fileUrl = url + "/download/file.php?id=" + str(i)
+
h = session.head(fileUrl)
header = h.headers
content_type = header.get('content-type')
@@ -71,34 +118,48 @@ def get_attachments_from_url(lang, url):
else:
invalidCount = 0
- if content_type == 'application/octet-stream':
- r = session.get(fileUrl, allow_redirects=True)
- with tempfile.NamedTemporaryFile() as tmp:
- tmp.write(r.content)
- mimetype = magic.from_file(tmp.name, mime=True)
- if mimetype in mimetypes:
- suffix = mimetypes[mimetype]
- try:
- os.mkdir(suffix)
- except:
- pass
+ r = session.get(fileUrl, allow_redirects=True)
+ with tempfile.NamedTemporaryFile() as tmp:
+ tmp.write(r.content)
+ mimetype = magic.from_file(tmp.name, mime=True)
+ if mimetype in mimetypes:
+ suffix = mimetypes[mimetype]
+ suffixDir = os.path.join(pathes.outdir, suffix)
+ try:
+ os.mkdir(suffixDir)
+ except:
+ pass
- download = suffix + '/' + "forum-" + lang + '-' + str(i) + '.' + suffix
+ download = os.path.join(suffixDir,
+ "forum-" + lang + '-' + str(i) + '.' + suffix)
- print("Downloading as " + download)
- shutil.copy(tmp.name, download)
+ print("Downloading as " + download)
+ shutil.copy(tmp.name, download)
# Save the index
with open(indexFile, 'w') as f:
f.write(str(i))
if __name__ == '__main__':
+ parser = argparse.ArgumentParser()
+
+ parser.add_argument('--outdir', action='store', dest="outdir", required=True)
+ parser.add_argument('--config', action="store", dest="config", required=True)
+
+ pathes = parser.parse_args()
+
+ if not os.path.exists(pathes.outdir) or os.path.isfile(pathes.outdir):
+ print("Outdir folder doesn't exists")
+ sys.exit(1)
+ elif not os.path.exists(pathes.config) or not os.path.isfile(pathes.config):
+ print("Config file doesn't exists")
+ sys.exit(1)
processes = []
# 10 at a time seems to work fine
with ThreadPoolExecutor(max_workers=10) as executor:
- for lang, url in languages.items():
- processes.append(executor.submit(get_attachments_from_url, lang, url))
+ for lang, url in forums.items():
+ processes.append(executor.submit(get_attachments_from_url, lang, url, pathes))
for task in as_completed(processes):
result = task.result()