diff options
author | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-06-06 17:28:57 +0200 |
---|---|---|
committer | Xisco Fauli <xiscofauli@libreoffice.org> | 2022-06-07 20:26:13 +0200 |
commit | 8b8e9d3126d4232c6c13e6059ab3542a521251d8 (patch) | |
tree | 330b8f6e55124605729bb03ec4fb1a3b23184ba7 /bin | |
parent | 8741fd0e0ae9e346de2e09887f0668b831c9b48b (diff) |
bin: Add script to get attachments from OO forums
Testing it locally, I could download 52,000 documents
Reuse mimetypes dictionary from get-bugzilla-attachments-by-mimetype
by putting it into an external file
Change-Id: I875d90f6119c3c3bdfea6a0efd3bbc8c5be1eb63
Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135457
Tested-by: Jenkins
Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
Diffstat (limited to 'bin')
-rw-r--r-- | bin/attachment_mimetypes.py | 157 | ||||
-rwxr-xr-x | bin/get-bugzilla-attachments-by-mimetype | 158 | ||||
-rwxr-xr-x | bin/get-forum-attachments.py | 106 |
3 files changed, 264 insertions, 157 deletions
diff --git a/bin/attachment_mimetypes.py b/bin/attachment_mimetypes.py new file mode 100644 index 000000000000..ede5fcb39fb9 --- /dev/null +++ b/bin/attachment_mimetypes.py @@ -0,0 +1,157 @@ +mimetypes = { +# ODF + 'application/vnd.oasis.opendocument.base': 'odb', + 'application/vnd.oasis.opendocument.database': 'odb', + 'application/vnd.oasis.opendocument.chart': 'odc', + 'application/vnd.oasis.opendocument.chart-template': 'otc', + 'application/vnd.oasis.opendocument.formula': 'odf', + 'application/vnd.oasis.opendocument.formula-template': 'otf', + 'application/vnd.oasis.opendocument.graphics': 'odg', + 'application/vnd.oasis.opendocument.graphics-template': 'otg', + 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg', + 'application/vnd.oasis.opendocument.presentation': 'odp', + 'application/vnd.oasis.opendocument.presentation-template': 'otp', + 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp', + 'application/vnd.oasis.opendocument.spreadsheet': 'ods', + 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots', + 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods', + 'application/vnd.oasis.opendocument.text': 'odt', + 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt', + 'application/vnd.oasis.opendocument.text-master': 'odm', + 'application/vnd.oasis.opendocument.text-template': 'ott', + 'application/vnd.oasis.opendocument.text-master-template': 'otm', + 'application/vnd.oasis.opendocument.text-web': 'oth', +# OOo XML + 'application/vnd.sun.xml.base': 'odb', + 'application/vnd.sun.xml.calc': 'sxc', + 'application/vnd.sun.xml.calc.template': 'stc', + 'application/vnd.sun.xml.chart': 'sxs', + 'application/vnd.sun.xml.draw': 'sxd', + 'application/vnd.sun.xml.draw.template': 'std', + 'application/vnd.sun.xml.impress': 'sxi', + 'application/vnd.sun.xml.impress.template': 'sti', + 'application/vnd.sun.xml.math': 'sxm', + 'application/vnd.sun.xml.writer': 'sxw', + 'application/vnd.sun.xml.writer.global': 
'sxg', + 'application/vnd.sun.xml.writer.template': 'stw', + 'application/vnd.sun.xml.writer.web': 'stw', +# MSO + 'application/rtf': 'rtf', + 'text/rtf': 'rtf', + 'application/msword': 'doc', + 'application/vnd.ms-powerpoint': 'ppt', + 'application/vnd.ms-excel': 'xls', + 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb', + 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm', + 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm', + 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm', + 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm', + 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm', + 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm', + 'application/vnd.ms-word.document.macroEnabled.12': 'docm', + 'application/vnd.ms-word.template.macroEnabled.12': 'dotm', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx', + 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx', + 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx', + 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx', + 'application/vnd.visio': 'vsd', + 'application/visio.drawing': 'vsd', + 'application/vnd.visio2013': 'vsdx', + 'application/vnd.visio.xml': 'vdx', + 'application/x-mspublisher': 'pub', +#WPS Office + 'application/wps-office.doc': 'doc', + 'application/wps-office.docx': 'docx', + 'application/wps-office.xls': 'xls', + 'application/wps-office.xlsx': 'xlsx', + 'application/wps-office.ppt': 'ppt', + 'application/wps-office.pptx': 'pptx', +# W3C + 'application/xhtml+xml': 
'xhtml', + 'application/mathml+xml': 'mml', + 'text/html': 'html', + 'application/docbook+xml': 'docbook', +# misc + 'text/csv': 'csv', + 'text/spreadsheet': 'slk', + 'application/x-qpro': 'qpro', + 'application/x-dbase': 'dbf', + 'application/vnd.corel-draw': 'cdr', + 'application/vnd.lotus-wordpro': 'lwp', + 'application/vnd.lotus-1-2-3': 'wks', + 'application/vnd.wordperfect': 'wpd', + 'application/wordperfect5.1': 'wpd', + 'application/vnd.ms-works': 'wps', + 'application/clarisworks' : 'cwk', + 'application/macwriteii' : 'mw', + 'application/vnd.apple.keynote': 'key', + 'application/vnd.apple.numbers': 'numbers', + 'application/vnd.apple.pages': 'pages', + 'application/x-iwork-keynote-sffkey': 'key', + 'application/x-iwork-numbers-sffnumbers': 'numbers', + 'application/x-iwork-pages-sffpages': 'pages', + 'application/x-hwp': 'hwp', + 'application/x-aportisdoc': 'pdb', + 'application/prs.plucker' : 'pdb_plucker', + 'application/vnd.palm' : 'pdb_palm', + 'application/x-sony-bbeb' : 'lrf', + 'application/x-pocket-word': 'psw', + 'application/x-t602': '602', + 'application/x-fictionbook+xml': 'fb2', + 'application/x-abiword': 'abw', + 'application/x-pagemaker': 'pmd', + 'application/x-gnumeric': 'gnumeric', + 'application/vnd.stardivision.calc': 'sdc', + 'application/vnd.stardivision.draw': 'sda', + 'application/vnd.stardivision.writer': 'sdw', + 'application/x-starcalc': 'sdc', + 'application/x-stardraw': 'sdd', + 'application/x-starwriter': 'sdw', +# relatively uncommon image mimetypes + 'image/x-freehand': 'fh', + 'image/cgm': 'cgm', + 'image/tif': 'tiff', + 'image/tiff': 'tiff', + 'image/vnd.dxf': 'dxf', + 'image/emf': 'emf', + 'image/x-emf': 'emf', + 'image/x-targa': 'tga', + 'image/x-sgf': 'sgf', + 'image/x-svm': 'svm', + 'image/wmf': 'wmf', + 'image/x-wmf': 'wmf', + 'image/x-pict': 'pict', + 'image/x-cmx': 'cmx', + 'image/svg+xml': 'svg', + 'image/bmp': 'bmp', + 'image/x-ms-bmp': 'bmp', + 'image/x-MS-bmp': 'bmp', + 'image/x-wpg': 'wpg', + 'image/x-eps': 
'eps', + 'image/x-met': 'met', + 'image/x-portable-bitmap': 'pbm', + 'image/x-photo-cd': 'pcd', + 'image/x-pcx': 'pcx', + 'image/x-portable-graymap': 'pgm', + 'image/x-portable-pixmap': 'ppm', + 'image/vnd.adobe.photoshop': 'psd', + 'image/x-cmu-raster': 'ras', + 'image/x-sun-raster': 'ras', + 'image/x-xbitmap': 'xbm', + 'image/x-xpixmap': 'xpm', +} + +# disabled for now, this would download gigs of pngs/jpegs... +common_noncore_mimetypes = { +# graphics + 'image/gif': 'gif', + 'image/jpeg': 'jpeg', + 'image/png': 'png', +# pdf, etc. + 'application/pdf': 'pdf', +} + diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype index a38b6ea95bca..609e6683a0aa 100755 --- a/bin/get-bugzilla-attachments-by-mimetype +++ b/bin/get-bugzilla-attachments-by-mimetype @@ -46,6 +46,7 @@ except: import xmlrpclib from xml.dom import minidom from xml.sax.saxutils import escape +from attachment_mimetypes import mimetypes def urlopen_retry(url): maxretries = 3 @@ -370,163 +371,6 @@ redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id=' #system is a nightmare novellattach = 'https://bugzilla.novell.com/attachment.cgi?id=' -mimetypes = { -# ODF - 'application/vnd.oasis.opendocument.base': 'odb', - 'application/vnd.oasis.opendocument.database': 'odb', - 'application/vnd.oasis.opendocument.chart': 'odc', - 'application/vnd.oasis.opendocument.chart-template': 'otc', - 'application/vnd.oasis.opendocument.formula': 'odf', - 'application/vnd.oasis.opendocument.formula-template': 'otf', - 'application/vnd.oasis.opendocument.graphics': 'odg', - 'application/vnd.oasis.opendocument.graphics-template': 'otg', - 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg', - 'application/vnd.oasis.opendocument.presentation': 'odp', - 'application/vnd.oasis.opendocument.presentation-template': 'otp', - 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp', - 'application/vnd.oasis.opendocument.spreadsheet': 'ods', - 
'application/vnd.oasis.opendocument.spreadsheet-template': 'ots', - 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods', - 'application/vnd.oasis.opendocument.text': 'odt', - 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt', - 'application/vnd.oasis.opendocument.text-master': 'odm', - 'application/vnd.oasis.opendocument.text-template': 'ott', - 'application/vnd.oasis.opendocument.text-master-template': 'otm', - 'application/vnd.oasis.opendocument.text-web': 'oth', -# OOo XML - 'application/vnd.sun.xml.base': 'odb', - 'application/vnd.sun.xml.calc': 'sxc', - 'application/vnd.sun.xml.calc.template': 'stc', - 'application/vnd.sun.xml.chart': 'sxs', - 'application/vnd.sun.xml.draw': 'sxd', - 'application/vnd.sun.xml.draw.template': 'std', - 'application/vnd.sun.xml.impress': 'sxi', - 'application/vnd.sun.xml.impress.template': 'sti', - 'application/vnd.sun.xml.math': 'sxm', - 'application/vnd.sun.xml.writer': 'sxw', - 'application/vnd.sun.xml.writer.global': 'sxg', - 'application/vnd.sun.xml.writer.template': 'stw', - 'application/vnd.sun.xml.writer.web': 'stw', -# MSO - 'application/rtf': 'rtf', - 'text/rtf': 'rtf', - 'application/msword': 'doc', - 'application/vnd.ms-powerpoint': 'ppt', - 'application/vnd.ms-excel': 'xls', - 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb', - 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm', - 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm', - 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm', - 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm', - 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm', - 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm', - 'application/vnd.ms-word.document.macroEnabled.12': 'docm', - 'application/vnd.ms-word.template.macroEnabled.12': 'dotm', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx', - 
'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx', - 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx', - 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx', - 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx', - 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx', - 'application/vnd.visio': 'vsd', - 'application/visio.drawing': 'vsd', - 'application/vnd.visio2013': 'vsdx', - 'application/vnd.visio.xml': 'vdx', - 'application/x-mspublisher': 'pub', -#WPS Office - 'application/wps-office.doc': 'doc', - 'application/wps-office.docx': 'docx', - 'application/wps-office.xls': 'xls', - 'application/wps-office.xlsx': 'xlsx', - 'application/wps-office.ppt': 'ppt', - 'application/wps-office.pptx': 'pptx', -# W3C - 'application/xhtml+xml': 'xhtml', - 'application/mathml+xml': 'mml', - 'text/html': 'html', - 'application/docbook+xml': 'docbook', -# misc - 'text/csv': 'csv', - 'text/spreadsheet': 'slk', - 'application/x-qpro': 'qpro', - 'application/x-dbase': 'dbf', - 'application/vnd.corel-draw': 'cdr', - 'application/vnd.lotus-wordpro': 'lwp', - 'application/vnd.lotus-1-2-3': 'wks', - 'application/vnd.wordperfect': 'wpd', - 'application/wordperfect5.1': 'wpd', - 'application/vnd.ms-works': 'wps', - 'application/clarisworks' : 'cwk', - 'application/macwriteii' : 'mw', - 'application/vnd.apple.keynote': 'key', - 'application/vnd.apple.numbers': 'numbers', - 'application/vnd.apple.pages': 'pages', - 'application/x-iwork-keynote-sffkey': 'key', - 'application/x-iwork-numbers-sffnumbers': 'numbers', - 'application/x-iwork-pages-sffpages': 'pages', - 'application/x-hwp': 'hwp', - 'application/x-aportisdoc': 'pdb', - 'application/prs.plucker' : 'pdb_plucker', - 
'application/vnd.palm' : 'pdb_palm', - 'application/x-sony-bbeb' : 'lrf', - 'application/x-pocket-word': 'psw', - 'application/x-t602': '602', - 'application/x-fictionbook+xml': 'fb2', - 'application/x-abiword': 'abw', - 'application/x-pagemaker': 'pmd', - 'application/x-gnumeric': 'gnumeric', - 'application/vnd.stardivision.calc': 'sdc', - 'application/vnd.stardivision.draw': 'sda', - 'application/vnd.stardivision.writer': 'sdw', - 'application/x-starcalc': 'sdc', - 'application/x-stardraw': 'sdd', - 'application/x-starwriter': 'sdw', -# relatively uncommon image mimetypes - 'image/x-freehand': 'fh', - 'image/cgm': 'cgm', - 'image/tif': 'tiff', - 'image/tiff': 'tiff', - 'image/vnd.dxf': 'dxf', - 'image/emf': 'emf', - 'image/x-emf': 'emf', - 'image/x-targa': 'tga', - 'image/x-sgf': 'sgf', - 'image/x-svm': 'svm', - 'image/wmf': 'wmf', - 'image/x-wmf': 'wmf', - 'image/x-pict': 'pict', - 'image/x-cmx': 'cmx', - 'image/svg+xml': 'svg', - 'image/bmp': 'bmp', - 'image/x-ms-bmp': 'bmp', - 'image/x-MS-bmp': 'bmp', - 'image/x-wpg': 'wpg', - 'image/x-eps': 'eps', - 'image/x-met': 'met', - 'image/x-portable-bitmap': 'pbm', - 'image/x-photo-cd': 'pcd', - 'image/x-pcx': 'pcx', - 'image/x-portable-graymap': 'pgm', - 'image/x-portable-pixmap': 'ppm', - 'image/vnd.adobe.photoshop': 'psd', - 'image/x-cmu-raster': 'ras', - 'image/x-sun-raster': 'ras', - 'image/x-xbitmap': 'xbm', - 'image/x-xpixmap': 'xpm', -} - -# disabled for now, this would download gigs of pngs/jpegs... -common_noncore_mimetypes = { -# graphics - 'image/gif': 'gif', - 'image/jpeg': 'jpeg', - 'image/png': 'png', -# pdf, etc. 
- 'application/pdf': 'pdf', -} - class manage_threads(threading.Thread): def run(self): #print(threading.current_thread().get_ident()) diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py new file mode 100755 index 000000000000..9b967d5a4963 --- /dev/null +++ b/bin/get-forum-attachments.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 + +# This file is part of the LibreOffice project. +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +import requests +from requests.adapters import HTTPAdapter +from requests.packages.urllib3.util.retry import Retry +import magic +import tempfile +import os +import shutil +from attachment_mimetypes import mimetypes +from concurrent.futures import ThreadPoolExecutor, as_completed + +# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages +languages = { + 'en': "https://forum.openoffice.org/en/forum", + 'es': "https://forum.openoffice.org/es/forum", + 'fr': "https://forum.openoffice.org/fr/forum", + 'hu': "https://forum.openoffice.org/hu/forum", + 'it': "https://forum.openoffice.org/it/forum", + 'ja': "https://forum.openoffice.org/ja/forum", + 'nl': "https://forum.openoffice.org/nl/forum", + 'pl': "https://forum.openoffice.org/pl/forum", + 'vi': "https://forum.openoffice.org/vi/forum", + 'tr': "https://forum.libreoffice.org.tr", + 'de': "https://www.openoffice-forum.de", + 'de2': "https://www.libreoffice-forum.de", + 'de3': "https://de.openoffice.info", +} + +def get_attachments_from_url(lang, url): + + print("Checking " + url) + + startPoint = 0 + + # Keep the index and resume from there + indexFile = lang + ".index" + if os.path.isfile(indexFile): + with open(indexFile) as f: + startPoint = int(f.readline().rstrip()) + 1 + else: + if lang == 'hu': + startPoint = 1300 + + session = requests.Session() + retry = Retry(connect=3, 
backoff_factor=0.5) + adapter = HTTPAdapter(max_retries=retry) + session.mount('http://', adapter) + session.mount('https://', adapter) + + invalidCount = 0 + for i in range(startPoint, 999999): + fileUrl = url + "/download/file.php?id=" + str(i) + h = session.head(fileUrl) + header = h.headers + content_type = header.get('content-type') + if "html" in content_type: + # Let's assume this is an invalid file link + invalidCount += 1 + + # Let's assume, if we get 100 invalid files, that there are no more files + if invalidCount == 100: + print("No more attachments found in " + url) + break + else: + invalidCount = 0 + + if content_type == 'application/octet-stream': + r = session.get(fileUrl, allow_redirects=True) + with tempfile.NamedTemporaryFile() as tmp: + tmp.write(r.content) + mimetype = magic.from_file(tmp.name, mime=True) + if mimetype in mimetypes: + suffix = mimetypes[mimetype] + try: + os.mkdir(suffix) + except: + pass + + download = suffix + '/' + "forum-" + lang + '-' + str(i) + '.' + suffix + + print("Downloading as " + download) + shutil.copy(tmp.name, download) + + # Save the index + with open(indexFile, 'w') as f: + f.write(str(i)) + +if __name__ == '__main__': + + processes = [] + # 10 at a time seems to work fine + with ThreadPoolExecutor(max_workers=10) as executor: + for lang, url in languages.items(): + processes.append(executor.submit(get_attachments_from_url, lang, url)) + + for task in as_completed(processes): + result = task.result() + if result: + print(result) |