1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
import json
import xml.dom.minidom as minidom
class Article(object):
    """
    One article entry of a metabook.

    An article is described by a dict of attributes (e.g. its title). When
    serialised with toDict() these attributes are laid over the base fields
    in itemTag.
    """

    # Base fields shared by every article item. Further keys such as
    # "wikiident", "url", "source-url" or "source" may be supplied through
    # the per-article attributes.
    itemTag = {"content_type":"text/x-wiki","type":"article"}

    # Class-level defaults; __init__ rebinds `attributes` on every instance.
    attributes = {}
    include = True  # True if this article should be included in the metabook

    def __init__(self,attributes):
        """
        @attributes Dict of fields describing this article
        """
        self.attributes = attributes

    def getInclude(self):
        """ @return True if this article should be included in the metabook """
        return self.include

    def toDict(self):
        """
        @return Dict a copy of itemTag updated with this article's attributes;
                on a key clash the article's own value wins
        """
        merged = self.itemTag.copy()
        # Later update wins, so per-article attributes override the base fields.
        merged.update(self.attributes)
        return merged
class Metabook(object):
    """
    Builds a JSON metabook from an optional template and the pages of an
    XML dump.

    Sequence of usage:
        m = Metabook()
        m.loadTemplate(...)
        m.loadArticles(xml input)
        m.createBook()
        m.write(output)
    If template, in- and output are files, use fromFileToFile()
    """

    ArticleClass = Article  # final; class used to wrap each page
    artTags = ["title"]  # final; tag names read from every <page>

    # Class-level defaults. The methods below always rebind these on the
    # instance (never mutate them in place), so they are not shared state.
    m = {}  # Dict metabook, filled by createBook()
    template = None  # Dict loaded by loadTemplate(), or None
    items = []  # List of ArticleClass objects, filled by loadArticles()

    def getClone(self):
        """
        @return a new Metabook with the same template, ArticleClass and
                artTags. The template is shared, not copied; items and the
                generated book are not carried over.
        """
        m = Metabook()
        m.template = self.template  # No copy() necessary here
        m.ArticleClass = self.ArticleClass
        m.artTags = self.artTags
        return m

    def getArtTags(self,filename,tagnames):
        """
        Get Article Tags
        Reads the specified tags of every <page> element in an xml file.
        For each page only the first occurrence of a tag is used; a tag
        that does not occur yields an empty string.
        @filename XML-file
        @tagnames List of String Tagnames
        @return List of Dict<String Tagname, String Value>, one per page
        """
        dom = minidom.parse(filename)
        try:
            out = []
            for element in dom.getElementsByTagName("page"):
                tagdict = {}
                for tagname in tagnames:
                    tags = element.getElementsByTagName(tagname)
                    tagdict[tagname] = self.getText(tags[0]) if tags else ""
                out.append(tagdict)
            return out
        finally:
            # Only plain strings were extracted above, so the DOM can be
            # released as soon as we are done with it.
            dom.unlink()

    def getText(self,element):
        """
        @element xml Node
        @return String data of the element's first child node, or "" if the
                element has no children (e.g. <title/>)
        """
        first = element.firstChild
        if first is None:
            # An empty element has no text node; treat it like a missing tag.
            return ""
        return first.data

    def load_data(self,filename):
        """
        Unserialize data from jsonfile
        @filename String path to a json file
        @return the decoded json content
        """
        with open(filename, "r") as infile:
            return json.load(infile)

    def loadTemplate(self,jsonStruct):
        """
        Loads an existing json file at the beginning
        @jsonStruct File object
        """
        self.template = json.load(jsonStruct)

    def loadArticles(self,source):
        """
        Loads the articles and saves them as objects to self.items
        @source xml-dump, passed on to getArtTags()
        """
        pages = self.getArtTags(source,self.artTags)
        self.items = [self.ArticleClass(page) for page in pages]

    def createBook(self):
        """
        Convert all article objects to dicts and merge them with the template.
        The result is saved to self.m. Articles whose getInclude() is false
        are left out. Without a template the book only has an "items" key.
        """
        # The book must be a dict even when no template was loaded, because
        # the "items" key is assigned below. The template is shallow-copied
        # so that setting "items" does not alter it.
        book = {} if self.template is None else self.template.copy()
        book["items"] = [
            item.toDict() for item in self.items if item.getInclude()
        ]
        self.m = book

    def __call__(self,source):
        """
        Creates a metabook for @source and writes it to self.m. To continue,
        use write()
        @source xml-dump
        """
        self.loadArticles(source)
        self.createBook()

    def write(self,dest):
        """
        Serialize the metabook self.m as json.
        @dest File object opened for writing
        """
        json.dump(self.m,dest)

    def fromFileToFile(self,jsonStructFile,xmldump,output):
        """
        Creates a Metabook from a file and writes it to a file.
        Short cut Function. This loads a metabook template file, creates the
        metabook content from @xmldump and writes the book to @output.
        @jsonStructFile String path to Metabook template
        @xmldump String path
        @output String path
        """
        with open(jsonStructFile,"r") as f:
            self.loadTemplate(f)
        self(xmldump)
        with open(output,"w") as f:
            self.write(f)
|