path: root/bin/get-bugzilla-attachments-by-mimetype
author     David Tardon <dtardon@redhat.com>    2013-12-07 13:15:36 +0100
committer  David Tardon <dtardon@redhat.com>    2013-12-07 16:37:04 +0100
commit     93b6e31c706cb5b98286fa6368f5483c26ff1505 (patch)
tree       48cecc5eb055b0a80876f0c06a471c73bc55584c /bin/get-bugzilla-attachments-by-mimetype
parent     d9344d6a4afd0e952acda53de5c8efa3f82437d2 (diff)
try to minimize bugzilla queries
This attempts to solve several deficiencies in the script:

1. If the first attachment of a bug is already downloaded, the bug is not
   checked for newly added attachments (or attachments with newly fixed
   mimetype).
2. If neither of the eligible attachment(s) of a bug is the first attachment,
   the bug will be processed (and the attachment downloaded) time and again
   (because the shortcut is only applied for the first attachment, see 1).

But it also ensures that if the script is killed, the download is restarted
on the next run.

Change-Id: I7f3d1922825bb314f96ec3b1ee2a0ac47604b018
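The savings come from looking at the files already on disk: their newest mtime yields a cutoff date, so Bugzilla can first be asked only for bugs touched since the last run, and the follow-up full query stops early when every bug it returns already has downloaded files. A rough standalone sketch of the cutoff-query idea, assuming the script's '<suffix>/<prefix><bugid>-<n>.<suffix>' file layout; the helper names here are illustrative, not the script's:

import datetime
import glob
import os
import stat

def newest_download_date(prefix, suffix):
    # newest mtime among files already saved as <suffix>/<prefix><bugid>-<n>.<suffix>
    files = glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
    if not files:
        return None
    newest = max(os.stat(f)[stat.ST_MTIME] for f in files)
    # a day of slack: timezone skew only costs extra work, never missed bugs
    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)

def build_incremental_query(mimetype, prefix, suffix):
    # Bugzilla "advanced search" query for bugs with an attachment of this mimetype
    query = {
        'query_format': 'advanced',
        'field0-0-0': 'attachments.mimetype',
        'type0-0-0': 'equals',
        'value0-0-0': mimetype,
    }
    since = newest_download_date(prefix, suffix)
    if since is not None:
        # restrict to bugs changed since the last run (illustrative; the patch
        # uses days_elapsed for the XML-RPC path and changed dates for the RSS path)
        query['field0-1-0'] = 'days_elapsed'
        query['type0-1-0'] = 'lessthaneq'
        query['value0-1-0'] = str((datetime.date.today() - since).days)
    return query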
Diffstat (limited to 'bin/get-bugzilla-attachments-by-mimetype')
-rwxr-xr-x  bin/get-bugzilla-attachments-by-mimetype  271
1 file changed, 167 insertions, 104 deletions
diff --git a/bin/get-bugzilla-attachments-by-mimetype b/bin/get-bugzilla-attachments-by-mimetype
index 03b9f3278109..3957c0c75fab 100755
--- a/bin/get-bugzilla-attachments-by-mimetype
+++ b/bin/get-bugzilla-attachments-by-mimetype
@@ -21,8 +21,11 @@
 from __future__ import print_function
 import feedparser
 import base64
+import datetime
+import glob
 import re
 import os, os.path
+import stat
 import sys
 try:
     from urllib.request import urlopen
@@ -49,130 +52,190 @@ def urlopen_retry(url):
 def get_from_bug_url_via_xml(url, mimetype, prefix, suffix):
     id = url.rsplit('=', 2)[1]
     print("id is " + prefix + id + " " + suffix)
-    if os.path.isfile(suffix + '/' + prefix + id + '-1.' + suffix):
-        print("assuming " + id + " is up to date")
-    else:
-        print("parsing " + id)
-        sock = urlopen_retry(url+"&ctype=xml")
-        dom = minidom.parse(sock)
-        sock.close()
-        attachmentid=0
-        for attachment in dom.getElementsByTagName('attachment'):
-            attachmentid += 1
-            print(" mimetype is", end=' ')
-            for node in attachment.childNodes:
-                if node.nodeName == 'type':
-                    print(node.firstChild.nodeValue, end=' ')
-                    if node.firstChild.nodeValue.lower() != mimetype.lower():
-                        print('skipping')
-                        break
-                elif node.nodeName == 'data':
-                    # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
-                    if not node.firstChild:
-                        print('deleted attachment, skipping')
-                        continue
-
-                    download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
-                    print('downloading as ' + download)
-                    f = open(download, 'wb')
-                    f.write(base64.b64decode(node.firstChild.nodeValue))
-                    f.close()
+    print("parsing " + id)
+    sock = urlopen_retry(url+"&ctype=xml")
+    dom = minidom.parse(sock)
+    sock.close()
+    attachmentid=0
+    for attachment in dom.getElementsByTagName('attachment'):
+        attachmentid += 1
+        print(" mimetype is", end=' ')
+        for node in attachment.childNodes:
+            if node.nodeName == 'type':
+                print(node.firstChild.nodeValue, end=' ')
+                if node.firstChild.nodeValue.lower() != mimetype.lower():
+                    print('skipping')
                     break
+            elif node.nodeName == 'data':
+                # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml)
+                if not node.firstChild:
+                    print('deleted attachment, skipping')
+                    continue
+
+                download = suffix + '/' +prefix + id + '-' + str(attachmentid) + '.' + suffix
+                if os.path.isfile(download):
+                    print("assuming " + download + " is up to date")
+                    continue
+
+                print('downloading as ' + download)
+                f = open(download, 'wb')
+                f.write(base64.b64decode(node.firstChild.nodeValue))
+                f.close()
+                break
 def get_novell_bug_via_xml(url, mimetype, prefix, suffix):
     id = url.rsplit('=', 2)[1]
     print("id is " + prefix + id + " " + suffix)
-    if os.path.isfile(suffix + '/' + prefix + id + '-1.' + suffix):
-        print("assuming " + id + " is up to date")
-    else:
-        print("parsing " + id)
-        sock = urlopen_retry(url+"&ctype=xml")
-        dom = minidom.parse(sock)
-        sock.close()
-        attachmentid=0
-        for comment in dom.getElementsByTagName('thetext'):
-            commentText = comment.firstChild.nodeValue
-            match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
-            if not match:
-                continue
-
-            attachmentid += 1
-
-            download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
-            if os.path.isfile(download):
-                print("assuming " + download + " is up to date")
-                continue
-
-            realAttachmentId = match.group(1)
-            handle = urlopen_retry(novellattach + realAttachmentId)
-            if not handle:
-                print("attachment %s is not accessible" % realAttachmentId)
-                continue
-            print(" mimetype is", end=' ')
-
-            info = handle.info()
-            if info.get_content_type:
-                remoteMime = info.get_content_type()
-            else:
-                remoteMime = info.gettype()
-            print(remoteMime, end=' ')
-            if remoteMime != mimetype:
-                print("skipping")
-                continue
-
-            print('downloading as ' + download)
-            f = open(download, 'wb')
-            f.write(handle.read())
-            f.close()
+    print("parsing " + id)
+    sock = urlopen_retry(url+"&ctype=xml")
+    dom = minidom.parse(sock)
+    sock.close()
+    attachmentid=0
+    for comment in dom.getElementsByTagName('thetext'):
+        commentText = comment.firstChild.nodeValue
+        match = re.search(r".*Created an attachment \(id=([0-9]+)\)", commentText)
+        if not match:
+            continue
+
+        attachmentid += 1
+
+        download = suffix + '/' + prefix + id + '-' + str(attachmentid) + '.' + suffix
+        if os.path.isfile(download):
+            print("assuming " + download + " is up to date")
+            continue
+
+        realAttachmentId = match.group(1)
+        handle = urlopen_retry(novellattach + realAttachmentId)
+        if not handle:
+            print("attachment %s is not accessible" % realAttachmentId)
+            continue
+        print(" mimetype is", end=' ')
+
+        info = handle.info()
+        if info.get_content_type:
+            remoteMime = info.get_content_type()
+        else:
+            remoteMime = info.gettype()
+        print(remoteMime, end=' ')
+        if remoteMime != mimetype:
+            print("skipping")
+            continue
+
+        print('downloading as ' + download)
+        f = open(download, 'wb')
+        f.write(handle.read())
+        f.close()
+
+def create_query(mimetype):
+    query = dict()
+    query['query_format']='advanced'
+    query['field0-0-0']='attachments.mimetype'
+    query['type0-0-0']='equals'
+    query['value0-0-0']=mimetype
+    return query
+
+def get_downloaded_files(prefix, suffix):
+    return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix)))
+
+def get_file_bz_ids(files, prefix):
+    return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files])
+
+def get_changed_date(files):
+    newest = max([os.stat(f)[stat.ST_MTIME] for f in files])
+    # Subtract a day to avoid timezone differences. The worst thing that
+    # can happen is that we are going to process more bugs than necessary.
+    return datetime.date.fromtimestamp(newest - 24 * 60 * 60)
 def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix):
     try:
         os.mkdir(suffix)
     except:
         pass
-    try:
-        proxy = xmlrpclib.ServerProxy(rpcurl)
-        query = dict()
-        query['column_list']='bug_id'
-        query['query_format']='advanced'
-        query['field0-0-0']='attachments.mimetype'
-        query['type0-0-0']='equals'
-        query['value0-0-0']=mimetype
-        result = proxy.Bug.search(query)
-        bugs = result['bugs']
-        print(str(len(bugs)) + ' bugs to process')
-        for bug in bugs:
-            url = showurl + str(bug['id'])
-            get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
-    except xmlrpclib.Fault as err:
-        print("A fault occurred")
-        print("Fault code: %s" % err.faultCode)
-        print(err.faultString)
-
-def get_through_rss_query_url(url, mimetype, prefix, suffix):
+
+    def process(query, full, have=[]):
+        try:
+            proxy = xmlrpclib.ServerProxy(rpcurl)
+            result = proxy.Bug.search(query)
+            bugs = result['bugs']
+            print(str(len(bugs)) + ' bugs to process')
+
+            if full:
+                available = set([str(bug['id']) for bug in bugs])
+                # we already have files from all available bugs
+                if available.difference(set(have)) == set():
+                    print("assuming all downloaded files are up to date")
+                    return
+
+            for bug in bugs:
+                url = showurl + str(bug['id'])
+                get_from_bug_url_via_xml(url, mimetype, prefix, suffix)
+        except xmlrpclib.Fault as err:
+            print("A fault occurred")
+            print("Fault code: %s" % err.faultCode)
+            print(err.faultString)
+
+    query = create_query(mimetype)
+    query['column_list']='bug_id'
+
+    files = get_downloaded_files(prefix, suffix)
+
+    if files != []:
+        print('looking for updated bugs having %s attachment(s)' % mimetype)
+        query_changed = query.copy()
+        query_changed['field0-1-0'] = 'days_elapsed'
+        query_changed['type0-1-0'] = 'lessthaneq'
+        query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days)
+        process(query_changed, False)
+
+    print('looking for all bugs having %s attachment(s)' % mimetype)
+    process(query, True, get_file_bz_ids(files, prefix))
+
+def get_through_rss_query(queryurl, mimetype, prefix, suffix):
     try:
         os.mkdir(suffix)
     except:
         pass
-    d = feedparser.parse(url)
     #Getting detailed bug information and downloading an attachment body is not possible without logging in to Novell bugzilla
     #get_novell_bug_via_xml function is a workaround for that situation
     get_bug_function = get_novell_bug_via_xml if prefix == "novell" else get_from_bug_url_via_xml
-    for entry in d['entries']:
-        try:
-            get_bug_function(entry['id'], mimetype, prefix, suffix)
-        except KeyboardInterrupt:
-            raise # Ctrl+C should work
-        except:
-            print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
-            pass
-
-def get_through_rss_query(queryurl, mimetype, prefix, suffix):
-    url = queryurl + '?query_format=advanced&field0-0-0=attachments.mimetype&type0-0-0=equals&value0-0-0=' + escape(mimetype) + '&ctype=rss'
-    print('url is ' + url)
-    get_through_rss_query_url(url, mimetype, prefix, suffix)
+    def process(query, full, have=[]):
+        url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.iteritems()])
+        print('url is ' + url)
+        d = feedparser.parse(url)
+
+        if full:
+            available = set([str(entry['id'].split('=')[-1]) for entry in d['entries']])
+            # we already have files from all available bugs
+            if available.difference(set(have)) == set():
+                print("assuming all downloaded files are up to date")
+                return
+
+        for entry in d['entries']:
+            try:
+                get_bug_function(entry['id'], mimetype, prefix, suffix)
+            except KeyboardInterrupt:
+                raise # Ctrl+C should work
+            except:
+                print(entry['id'] + " failed: " + str(sys.exc_info()[0]))
+                pass
+
+    query = create_query(escape(mimetype))
+    query['ctype'] = 'rss'
+
+    files = get_downloaded_files(prefix, suffix)
+
+    if files != []:
+        print('looking for updated bugs having %s attachment(s)' % mimetype)
+        query_changed = query.copy()
+        query_changed['field0-1-0'] = 'changed'
+        query_changed['type0-1-0'] = 'changedbefore'
+        query_changed['value0-1-0'] = get_changed_date(files).isoformat()
+        process(query_changed, False)
+
+    print('looking for all bugs having %s attachment(s)' % mimetype)
+    process(query, True, get_file_bz_ids(files, prefix))
 def get_launchpad_bugs(prefix):
     #launchpadlib python module is required to download launchpad attachments