summaryrefslogtreecommitdiff
path: root/bin
diff options
context:
space:
mode:
author: Xisco Fauli <xiscofauli@libreoffice.org> 2022-06-06 17:28:57 +0200
committer: Xisco Fauli <xiscofauli@libreoffice.org> 2022-06-07 20:26:13 +0200
commit: 8b8e9d3126d4232c6c13e6059ab3542a521251d8 (patch)
tree: 330b8f6e55124605729bb03ec4fb1a3b23184ba7 /bin
parent: 8741fd0e0ae9e346de2e09887f0668b831c9b48b (diff)
bin: Add script to get attachments from OO forums
Testing it locally, I could download 52,000 documents. Reuse the mimetypes dictionary from get-bugzilla-attachments-by-mimetype by putting it into an external file. Change-Id: I875d90f6119c3c3bdfea6a0efd3bbc8c5be1eb63 Reviewed-on: https://gerrit.libreoffice.org/c/core/+/135457 Tested-by: Jenkins Reviewed-by: Xisco Fauli <xiscofauli@libreoffice.org>
Diffstat (limited to 'bin')
-rw-r--r-- bin/attachment_mimetypes.py 157
-rwxr-xr-x bin/get-bugzilla-attachments-by-mimetype 158
-rwxr-xr-x bin/get-forum-attachments.py 106
3 files changed, 264 insertions, 157 deletions
diff --git a/bin/attachment_mimetypes.py b/bin/attachment_mimetypes.py
new file mode 100644
index 000000000000..ede5fcb39fb9
--- /dev/null
+++ b/bin/attachment_mimetypes.py
@@ -0,0 +1,157 @@
+# Maps a MIME type to the short suffix used for files of that type.
+# get-forum-attachments.py uses the value both as the name of the target
+# directory and as the extension of the saved file, so several MIME types
+# that share a value end up in the same directory.
+mimetypes = {
+# ODF
+ 'application/vnd.oasis.opendocument.base': 'odb',
+ 'application/vnd.oasis.opendocument.database': 'odb',
+ 'application/vnd.oasis.opendocument.chart': 'odc',
+ 'application/vnd.oasis.opendocument.chart-template': 'otc',
+ 'application/vnd.oasis.opendocument.formula': 'odf',
+ 'application/vnd.oasis.opendocument.formula-template': 'otf',
+ 'application/vnd.oasis.opendocument.graphics': 'odg',
+ 'application/vnd.oasis.opendocument.graphics-template': 'otg',
+ 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
+ 'application/vnd.oasis.opendocument.presentation': 'odp',
+ 'application/vnd.oasis.opendocument.presentation-template': 'otp',
+ 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
+ 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
+ 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
+ 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
+ 'application/vnd.oasis.opendocument.text': 'odt',
+ 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
+ 'application/vnd.oasis.opendocument.text-master': 'odm',
+ 'application/vnd.oasis.opendocument.text-template': 'ott',
+ 'application/vnd.oasis.opendocument.text-master-template': 'otm',
+ 'application/vnd.oasis.opendocument.text-web': 'oth',
+# OOo XML
+ 'application/vnd.sun.xml.base': 'odb',
+ 'application/vnd.sun.xml.calc': 'sxc',
+ 'application/vnd.sun.xml.calc.template': 'stc',
+ 'application/vnd.sun.xml.chart': 'sxs',
+ 'application/vnd.sun.xml.draw': 'sxd',
+ 'application/vnd.sun.xml.draw.template': 'std',
+ 'application/vnd.sun.xml.impress': 'sxi',
+ 'application/vnd.sun.xml.impress.template': 'sti',
+ 'application/vnd.sun.xml.math': 'sxm',
+ 'application/vnd.sun.xml.writer': 'sxw',
+ 'application/vnd.sun.xml.writer.global': 'sxg',
+ 'application/vnd.sun.xml.writer.template': 'stw',
+ 'application/vnd.sun.xml.writer.web': 'stw',
+# MSO
+ 'application/rtf': 'rtf',
+ 'text/rtf': 'rtf',
+ 'application/msword': 'doc',
+ 'application/vnd.ms-powerpoint': 'ppt',
+ 'application/vnd.ms-excel': 'xls',
+ 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
+ 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
+ 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
+ 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
+ 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
+ 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
+ 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
+ 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
+ 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
+ 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
+ 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
+ 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
+ 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
+ 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
+ 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
+ 'application/vnd.visio': 'vsd',
+ 'application/visio.drawing': 'vsd',
+ 'application/vnd.visio2013': 'vsdx',
+ 'application/vnd.visio.xml': 'vdx',
+ 'application/x-mspublisher': 'pub',
+# WPS Office
+ 'application/wps-office.doc': 'doc',
+ 'application/wps-office.docx': 'docx',
+ 'application/wps-office.xls': 'xls',
+ 'application/wps-office.xlsx': 'xlsx',
+ 'application/wps-office.ppt': 'ppt',
+ 'application/wps-office.pptx': 'pptx',
+# W3C
+ 'application/xhtml+xml': 'xhtml',
+ 'application/mathml+xml': 'mml',
+ 'text/html': 'html',
+ 'application/docbook+xml': 'docbook',
+# misc
+ 'text/csv': 'csv',
+ 'text/spreadsheet': 'slk',
+ 'application/x-qpro': 'qpro',
+ 'application/x-dbase': 'dbf',
+ 'application/vnd.corel-draw': 'cdr',
+ 'application/vnd.lotus-wordpro': 'lwp',
+ 'application/vnd.lotus-1-2-3': 'wks',
+ 'application/vnd.wordperfect': 'wpd',
+ 'application/wordperfect5.1': 'wpd',
+ 'application/vnd.ms-works': 'wps',
+ 'application/clarisworks' : 'cwk',
+ 'application/macwriteii' : 'mw',
+ 'application/vnd.apple.keynote': 'key',
+ 'application/vnd.apple.numbers': 'numbers',
+ 'application/vnd.apple.pages': 'pages',
+ 'application/x-iwork-keynote-sffkey': 'key',
+ 'application/x-iwork-numbers-sffnumbers': 'numbers',
+ 'application/x-iwork-pages-sffpages': 'pages',
+ 'application/x-hwp': 'hwp',
+ 'application/x-aportisdoc': 'pdb',
+ 'application/prs.plucker' : 'pdb_plucker',
+ 'application/vnd.palm' : 'pdb_palm',
+ 'application/x-sony-bbeb' : 'lrf',
+ 'application/x-pocket-word': 'psw',
+ 'application/x-t602': '602',
+ 'application/x-fictionbook+xml': 'fb2',
+ 'application/x-abiword': 'abw',
+ 'application/x-pagemaker': 'pmd',
+ 'application/x-gnumeric': 'gnumeric',
+ 'application/vnd.stardivision.calc': 'sdc',
+ 'application/vnd.stardivision.draw': 'sda',
+ 'application/vnd.stardivision.writer': 'sdw',
+ 'application/x-starcalc': 'sdc',
+ 'application/x-stardraw': 'sdd',
+ 'application/x-starwriter': 'sdw',
+# relatively uncommon image mimetypes
+ 'image/x-freehand': 'fh',
+ 'image/cgm': 'cgm',
+ 'image/tif': 'tiff',
+ 'image/tiff': 'tiff',
+ 'image/vnd.dxf': 'dxf',
+ 'image/emf': 'emf',
+ 'image/x-emf': 'emf',
+ 'image/x-targa': 'tga',
+ 'image/x-sgf': 'sgf',
+ 'image/x-svm': 'svm',
+ 'image/wmf': 'wmf',
+ 'image/x-wmf': 'wmf',
+ 'image/x-pict': 'pict',
+ 'image/x-cmx': 'cmx',
+ 'image/svg+xml': 'svg',
+ 'image/bmp': 'bmp',
+ 'image/x-ms-bmp': 'bmp',
+ 'image/x-MS-bmp': 'bmp',
+ 'image/x-wpg': 'wpg',
+ 'image/x-eps': 'eps',
+ 'image/x-met': 'met',
+ 'image/x-portable-bitmap': 'pbm',
+ 'image/x-photo-cd': 'pcd',
+ 'image/x-pcx': 'pcx',
+ 'image/x-portable-graymap': 'pgm',
+ 'image/x-portable-pixmap': 'ppm',
+ 'image/vnd.adobe.photoshop': 'psd',
+ 'image/x-cmu-raster': 'ras',
+ 'image/x-sun-raster': 'ras',
+ 'image/x-xbitmap': 'xbm',
+ 'image/x-xpixmap': 'xpm',
+}
+
+# Very common types that are deliberately kept out of `mimetypes`:
+# enabling them would download gigabytes of PNGs/JPEGs.
+# Disabled for now; the import lines visible in this change only pull in
+# `mimetypes`, not this dictionary.
+common_noncore_mimetypes = {
+# graphics
+ 'image/gif': 'gif',
+ 'image/jpeg': 'jpeg',
+ 'image/png': 'png',
+# pdf, etc.
+ 'application/pdf': 'pdf',
+}
+
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index a38b6ea95bca..609e6683a0aa 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -46,6 +46,7 @@ except:
import xmlrpclib
from xml.dom import minidom
from xml.sax.saxutils import escape
+from attachment_mimetypes import mimetypes
def urlopen_retry(url):
maxretries = 3
@@ -370,163 +371,6 @@ redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id='
#system is a nightmare
novellattach = 'https://bugzilla.novell.com/attachment.cgi?id='
-mimetypes = {
-# ODF
- 'application/vnd.oasis.opendocument.base': 'odb',
- 'application/vnd.oasis.opendocument.database': 'odb',
- 'application/vnd.oasis.opendocument.chart': 'odc',
- 'application/vnd.oasis.opendocument.chart-template': 'otc',
- 'application/vnd.oasis.opendocument.formula': 'odf',
- 'application/vnd.oasis.opendocument.formula-template': 'otf',
- 'application/vnd.oasis.opendocument.graphics': 'odg',
- 'application/vnd.oasis.opendocument.graphics-template': 'otg',
- 'application/vnd.oasis.opendocument.graphics-flat-xml': 'fodg',
- 'application/vnd.oasis.opendocument.presentation': 'odp',
- 'application/vnd.oasis.opendocument.presentation-template': 'otp',
- 'application/vnd.oasis.opendocument.presentation-flat-xml': 'fodp',
- 'application/vnd.oasis.opendocument.spreadsheet': 'ods',
- 'application/vnd.oasis.opendocument.spreadsheet-template': 'ots',
- 'application/vnd.oasis.opendocument.spreadsheet-flat-xml': 'fods',
- 'application/vnd.oasis.opendocument.text': 'odt',
- 'application/vnd.oasis.opendocument.text-flat-xml': 'fodt',
- 'application/vnd.oasis.opendocument.text-master': 'odm',
- 'application/vnd.oasis.opendocument.text-template': 'ott',
- 'application/vnd.oasis.opendocument.text-master-template': 'otm',
- 'application/vnd.oasis.opendocument.text-web': 'oth',
-# OOo XML
- 'application/vnd.sun.xml.base': 'odb',
- 'application/vnd.sun.xml.calc': 'sxc',
- 'application/vnd.sun.xml.calc.template': 'stc',
- 'application/vnd.sun.xml.chart': 'sxs',
- 'application/vnd.sun.xml.draw': 'sxd',
- 'application/vnd.sun.xml.draw.template': 'std',
- 'application/vnd.sun.xml.impress': 'sxi',
- 'application/vnd.sun.xml.impress.template': 'sti',
- 'application/vnd.sun.xml.math': 'sxm',
- 'application/vnd.sun.xml.writer': 'sxw',
- 'application/vnd.sun.xml.writer.global': 'sxg',
- 'application/vnd.sun.xml.writer.template': 'stw',
- 'application/vnd.sun.xml.writer.web': 'stw',
-# MSO
- 'application/rtf': 'rtf',
- 'text/rtf': 'rtf',
- 'application/msword': 'doc',
- 'application/vnd.ms-powerpoint': 'ppt',
- 'application/vnd.ms-excel': 'xls',
- 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': 'xlsb',
- 'application/vnd.ms-excel.sheet.macroEnabled.12': 'xlsm',
- 'application/vnd.ms-excel.template.macroEnabled.12': 'xltm',
- 'application/vnd.ms-powerpoint.presentation.macroEnabled.12': 'pptm',
- 'application/vnd.ms-powerpoint.slide.macroEnabled.12': 'sldm',
- 'application/vnd.ms-powerpoint.slideshow.macroEnabled.12': 'ppsm',
- 'application/vnd.ms-powerpoint.template.macroEnabled.12': 'potm',
- 'application/vnd.ms-word.document.macroEnabled.12': 'docm',
- 'application/vnd.ms-word.template.macroEnabled.12': 'dotm',
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'xlsx',
- 'application/vnd.openxmlformats-officedocument.spreadsheetml.template': 'xltx',
- 'application/vnd.openxmlformats-officedocument.presentationml.presentation': 'pptx',
- 'application/vnd.openxmlformats-officedocument.presentationml.template': 'potx',
- 'application/vnd.openxmlformats-officedocument.presentationml.slideshow': 'ppsx',
- 'application/vnd.openxmlformats-officedocument.presentationml.slide': 'sldx',
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
- 'application/vnd.openxmlformats-officedocument.wordprocessingml.template': 'dotx',
- 'application/vnd.visio': 'vsd',
- 'application/visio.drawing': 'vsd',
- 'application/vnd.visio2013': 'vsdx',
- 'application/vnd.visio.xml': 'vdx',
- 'application/x-mspublisher': 'pub',
-#WPS Office
- 'application/wps-office.doc': 'doc',
- 'application/wps-office.docx': 'docx',
- 'application/wps-office.xls': 'xls',
- 'application/wps-office.xlsx': 'xlsx',
- 'application/wps-office.ppt': 'ppt',
- 'application/wps-office.pptx': 'pptx',
-# W3C
- 'application/xhtml+xml': 'xhtml',
- 'application/mathml+xml': 'mml',
- 'text/html': 'html',
- 'application/docbook+xml': 'docbook',
-# misc
- 'text/csv': 'csv',
- 'text/spreadsheet': 'slk',
- 'application/x-qpro': 'qpro',
- 'application/x-dbase': 'dbf',
- 'application/vnd.corel-draw': 'cdr',
- 'application/vnd.lotus-wordpro': 'lwp',
- 'application/vnd.lotus-1-2-3': 'wks',
- 'application/vnd.wordperfect': 'wpd',
- 'application/wordperfect5.1': 'wpd',
- 'application/vnd.ms-works': 'wps',
- 'application/clarisworks' : 'cwk',
- 'application/macwriteii' : 'mw',
- 'application/vnd.apple.keynote': 'key',
- 'application/vnd.apple.numbers': 'numbers',
- 'application/vnd.apple.pages': 'pages',
- 'application/x-iwork-keynote-sffkey': 'key',
- 'application/x-iwork-numbers-sffnumbers': 'numbers',
- 'application/x-iwork-pages-sffpages': 'pages',
- 'application/x-hwp': 'hwp',
- 'application/x-aportisdoc': 'pdb',
- 'application/prs.plucker' : 'pdb_plucker',
- 'application/vnd.palm' : 'pdb_palm',
- 'application/x-sony-bbeb' : 'lrf',
- 'application/x-pocket-word': 'psw',
- 'application/x-t602': '602',
- 'application/x-fictionbook+xml': 'fb2',
- 'application/x-abiword': 'abw',
- 'application/x-pagemaker': 'pmd',
- 'application/x-gnumeric': 'gnumeric',
- 'application/vnd.stardivision.calc': 'sdc',
- 'application/vnd.stardivision.draw': 'sda',
- 'application/vnd.stardivision.writer': 'sdw',
- 'application/x-starcalc': 'sdc',
- 'application/x-stardraw': 'sdd',
- 'application/x-starwriter': 'sdw',
-# relatively uncommon image mimetypes
- 'image/x-freehand': 'fh',
- 'image/cgm': 'cgm',
- 'image/tif': 'tiff',
- 'image/tiff': 'tiff',
- 'image/vnd.dxf': 'dxf',
- 'image/emf': 'emf',
- 'image/x-emf': 'emf',
- 'image/x-targa': 'tga',
- 'image/x-sgf': 'sgf',
- 'image/x-svm': 'svm',
- 'image/wmf': 'wmf',
- 'image/x-wmf': 'wmf',
- 'image/x-pict': 'pict',
- 'image/x-cmx': 'cmx',
- 'image/svg+xml': 'svg',
- 'image/bmp': 'bmp',
- 'image/x-ms-bmp': 'bmp',
- 'image/x-MS-bmp': 'bmp',
- 'image/x-wpg': 'wpg',
- 'image/x-eps': 'eps',
- 'image/x-met': 'met',
- 'image/x-portable-bitmap': 'pbm',
- 'image/x-photo-cd': 'pcd',
- 'image/x-pcx': 'pcx',
- 'image/x-portable-graymap': 'pgm',
- 'image/x-portable-pixmap': 'ppm',
- 'image/vnd.adobe.photoshop': 'psd',
- 'image/x-cmu-raster': 'ras',
- 'image/x-sun-raster': 'ras',
- 'image/x-xbitmap': 'xbm',
- 'image/x-xpixmap': 'xpm',
-}
-
-# disabled for now, this would download gigs of pngs/jpegs...
-common_noncore_mimetypes = {
-# graphics
- 'image/gif': 'gif',
- 'image/jpeg': 'jpeg',
- 'image/png': 'png',
-# pdf, etc.
- 'application/pdf': 'pdf',
-}
-
class manage_threads(threading.Thread):
def run(self):
#print(threading.current_thread().get_ident())
diff --git a/bin/get-forum-attachments.py b/bin/get-forum-attachments.py
new file mode 100755
index 000000000000..9b967d5a4963
--- /dev/null
+++ b/bin/get-forum-attachments.py
@@ -0,0 +1,106 @@
+#!/usr/bin/env python3
+
+# This file is part of the LibreOffice project.
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import requests
+from requests.adapters import HTTPAdapter
+from requests.packages.urllib3.util.retry import Retry
+import magic
+import tempfile
+import os
+import shutil
+from attachment_mimetypes import mimetypes
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
+# Forums to crawl: short forum code -> base URL (no trailing slash).
+# The code is only a label: get_attachments_from_url() uses it to name the
+# resume file ("<code>.index") and the saved files ("forum-<code>-<id>.<ext>"),
+# which is why the additional German forums get the keys 'de2' and 'de3'.
+languages = {
+ 'en': "https://forum.openoffice.org/en/forum",
+ 'es': "https://forum.openoffice.org/es/forum",
+ 'fr': "https://forum.openoffice.org/fr/forum",
+ 'hu': "https://forum.openoffice.org/hu/forum",
+ 'it': "https://forum.openoffice.org/it/forum",
+ 'ja': "https://forum.openoffice.org/ja/forum",
+ 'nl': "https://forum.openoffice.org/nl/forum",
+ 'pl': "https://forum.openoffice.org/pl/forum",
+ 'vi': "https://forum.openoffice.org/vi/forum",
+ 'tr': "https://forum.libreoffice.org.tr",
+ 'de': "https://www.openoffice-forum.de",
+ 'de2': "https://www.libreoffice-forum.de",
+ 'de3': "https://de.openoffice.info",
+}
+
+def get_attachments_from_url(lang, url):
+    """Crawl one forum for attachments and save the recognised ones.
+
+    Attachment ids are probed sequentially through
+    ``<url>/download/file.php?id=<n>``.  A HEAD request decides whether an
+    id is worth fetching: an HTML (or missing) content type counts as an
+    invalid link, and 100 invalid links in a row end the crawl.  Responses
+    served as ``application/octet-stream`` are downloaded and type-sniffed
+    with ``magic.from_file``; when the sniffed type is a key of
+    ``mimetypes`` the file is stored as ``<suffix>/forum-<lang>-<n>.<suffix>``
+    relative to the current working directory.
+
+    The id of the last stored attachment is written to ``<lang>.index`` so
+    that a later run resumes with the id right after it.
+
+    Args:
+        lang: Key from ``languages``; used in the index and output file names.
+        url: Base URL of the forum, without a trailing slash.
+
+    Returns:
+        None.
+    """
+    print("Checking " + url)
+
+    startPoint = 0
+
+    # Keep the index and resume from there
+    indexFile = lang + ".index"
+    if os.path.isfile(indexFile):
+        with open(indexFile) as f:
+            startPoint = int(f.readline().rstrip()) + 1
+    elif lang == 'hu':
+        # NOTE(review): hard-coded first id for the 'hu' forum; the reason
+        # is not recorded in this file.
+        startPoint = 1300
+
+    # The context manager closes the pooled connections once the crawl ends.
+    with requests.Session() as session:
+        retry = Retry(connect=3, backoff_factor=0.5)
+        adapter = HTTPAdapter(max_retries=retry)
+        session.mount('http://', adapter)
+        session.mount('https://', adapter)
+
+        invalidCount = 0
+        for i in range(startPoint, 999999):
+            fileUrl = url + "/download/file.php?id=" + str(i)
+            content_type = session.head(fileUrl).headers.get('content-type')
+
+            # A response without a Content-Type header used to crash the
+            # worker with a TypeError; treat it like an HTML page instead.
+            if not content_type or "html" in content_type:
+                # Let's assume this is an invalid file link
+                invalidCount += 1
+
+                # Let's assume, if we get 100 invalid files, that there are
+                # no more files
+                if invalidCount == 100:
+                    print("No more attachments found in " + url)
+                    break
+                continue
+
+            invalidCount = 0
+            if content_type != 'application/octet-stream':
+                continue
+
+            r = session.get(fileUrl, allow_redirects=True)
+            with tempfile.NamedTemporaryFile() as tmp:
+                tmp.write(r.content)
+                # magic.from_file() and shutil.copy() reopen the file by
+                # name, so the buffered bytes must reach the disk first.
+                tmp.flush()
+
+                mimetype = magic.from_file(tmp.name, mime=True)
+                if mimetype not in mimetypes:
+                    continue
+
+                suffix = mimetypes[mimetype]
+                # exist_ok tolerates the directory already being there
+                # (including a concurrent creation by another worker) while
+                # still surfacing real errors such as missing permissions.
+                os.makedirs(suffix, exist_ok=True)
+
+                download = suffix + '/' + "forum-" + lang + '-' + str(i) + '.' + suffix
+
+                print("Downloading as " + download)
+                shutil.copy(tmp.name, download)
+
+            # Save the index
+            # NOTE(review): the scraped source lost its indentation; saving
+            # only after a stored download is the assumed original placement.
+            with open(indexFile, 'w') as f:
+                f.write(str(i))
+
+if __name__ == '__main__':
+
+    # One crawl job per configured forum; 10 at a time seems to work fine.
+    with ThreadPoolExecutor(max_workers=10) as pool:
+        jobs = [
+            pool.submit(get_attachments_from_url, code, base_url)
+            for code, base_url in languages.items()
+        ]
+
+    # result() re-raises any exception a worker died with; a truthy return
+    # value, if any, is echoed.
+    for finished in as_completed(jobs):
+        outcome = finished.result()
+        if outcome:
+            print(outcome)