summary | refs | log | tree | commit | diff
path: root/bin
diff options
context:
space:
mode:
author: Xisco Fauli <xiscofauli@libreoffice.org> 2022-06-29 14:22:55 +0200
committer: Caolán McNamara <caolanm@redhat.com> 2022-07-19 11:39:24 +0200
commit: bb9d618390c8a498ce6348d7ed77235165cede7b (patch)
tree: fe4ce9ebb4807adc9199aa21058be48d41f7a01f /bin
parent: 0a2d22ceac6236fc999db368cc6201c6153dd71b (diff)
get-forum-attachments: Add 2 more mso forums
Change-Id: I5134ec5e39e398544764e4e50b6b50280759151d
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/136621
Tested-by: Jenkins
Tested-by: Caolán McNamara <caolanm@redhat.com>
Reviewed-by: Caolán McNamara <caolanm@redhat.com>
Diffstat (limited to 'bin')
-rwxr-xr-x bin/get-forum-attachments.py 68
1 files changed, 36 insertions, 32 deletions
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py
index 4e46befefa9a..18e9259d2133 100755
--- a/bin/get-forum-attachments.py
+++ b/bin/get-forum-attachments.py
@@ -24,25 +24,34 @@ from requests.packages.urllib3.util.retry import Retry
forums = {
# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
- 'en': "https://forum.openoffice.org/en/forum",
- 'es': "https://forum.openoffice.org/es/forum",
- 'fr': "https://forum.openoffice.org/fr/forum",
- 'hu': "https://forum.openoffice.org/hu/forum",
- 'it': "https://forum.openoffice.org/it/forum",
- 'ja': "https://forum.openoffice.org/ja/forum",
- 'nl': "https://forum.openoffice.org/nl/forum",
- 'pl': "https://forum.openoffice.org/pl/forum",
- 'vi': "https://forum.openoffice.org/vi/forum",
- 'tr': "https://forum.libreoffice.org.tr",
- 'de': "https://www.openoffice-forum.de",
- 'de2': "https://www.libreoffice-forum.de",
- 'de3': "https://de.openoffice.info",
+ 'en': ["https://forum.openoffice.org/en/forum", False, 0],
+ 'es': ["https://forum.openoffice.org/es/forum", False, 0],
+ 'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
+ 'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
+ 'it': ["https://forum.openoffice.org/it/forum", False, 0],
+ 'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
+ 'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
+ 'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
+ 'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
+ 'tr': ["https://forum.libreoffice.org.tr", False, 0],
+ 'de': ["https://www.openoffice-forum.de", False, 0],
+ 'de2': ["https://www.libreoffice-forum.de", False, 0],
+ 'de3': ["https://de.openoffice.info", False, 0],
# Others
- 'mso-en': "https://www.msofficeforums.com",
- 'mso-de': "https://www.ms-office-forum.net/forum",
+ 'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
+ 'mso-en': ["https://www.msofficeforums.com", True, 0],
+ 'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
+ 'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
+ # lang : [url, doLogin, startIndex]
}
-def do_login(session, url, configFile):
+def get_attachment_query(lang):
+ if lang.startswith("mso"):
+ return "/attachment.php?attachmentid="
+ else:
+ return "/download/file.php?id="
+
+def login(session, url, configFile):
config = configparser.ConfigParser()
config.read(configFile)
@@ -71,20 +80,18 @@ def do_login(session, url, configFile):
return False
-def get_attachments_from_url(lang, url, pathes):
+def get_attachments_from_url(lang, config, pathes):
+ url = config[0]
+ doLogin = config[1]
+ startIndex = config[2]
print("Checking " + url)
- startIndex = 0
-
# Keep the index and resume from there
indexFile = os.path.join(pathes.outdir, lang + ".index")
if os.path.isfile(indexFile):
with open(indexFile) as f:
startIndex = int(f.readline().rstrip()) + 1
- else:
- if lang == 'hu':
- startIndex = 1300
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
@@ -92,17 +99,14 @@ def get_attachments_from_url(lang, url, pathes):
session.mount('http://', adapter)
session.mount('https://', adapter)
- if lang.startswith("mso"):
- if not do_login(session, url, pathes.config):
+ if doLogin:
+ if not login(session, url, pathes.config):
print("Can't log in to " + url)
return
invalidCount = 0
for i in range(startIndex, 999999):
- if lang.startswith("mso"):
- fileUrl = url + "/attachment.php?attachmentid=" + str(i)
- else:
- fileUrl = url + "/download/file.php?id=" + str(i)
+ fileUrl = url + get_attachment_query(lang) + str(i)
h = session.head(fileUrl)
header = h.headers
@@ -111,8 +115,8 @@ def get_attachments_from_url(lang, url, pathes):
# Let's assume this is an invalid file link
invalidCount += 1
- # Let's assume, if we get 100 invalid files, that there are no more files
- if invalidCount == 100:
+ # Let's assume, if we get 200 invalid files, that there are no more files
+ if invalidCount == 200:
print("No more attachments found in " + url)
break
else:
@@ -158,8 +162,8 @@ if __name__ == '__main__':
processes = []
# 10 at a time seems to work fine
with ThreadPoolExecutor(max_workers=10) as executor:
- for lang, url in forums.items():
- processes.append(executor.submit(get_attachments_from_url, lang, url, pathes))
+ for lang, config in forums.items():
+ processes.append(executor.submit(get_attachments_from_url, lang, config, pathes))
for task in as_completed(processes):
result = task.result()