1
2
3
4
5 package org.astrogrid.registry.server.harvest;
6
7 import org.astrogrid.registry.server.XQueryExecution;
8 import java.rmi.RemoteException;
9
10
11 import java.io.IOException;
12 import org.xml.sax.SAXException;
13 import javax.xml.parsers.ParserConfigurationException;
14 import org.w3c.dom.Document;
15 import javax.xml.parsers.DocumentBuilderFactory;
16 import javax.xml.parsers.DocumentBuilder;
17 import org.w3c.dom.Element;
18 import org.w3c.dom.NodeList;
19 import org.w3c.dom.Node;
20 import org.w3c.dom.NamedNodeMap;
21 import org.xml.sax.InputSource;
22 import org.astrogrid.registry.server.RegistryServerHelper;
23 import org.astrogrid.registry.server.QueryHelper;
24 import org.astrogrid.registry.server.admin.RegistryAdminService;
25 import org.astrogrid.registry.server.query.RegistryQueryService;
26 import java.net.URL;
27 import java.io.Reader;
28 import java.io.StringReader;
29 import java.util.Date;
30 import java.text.SimpleDateFormat;
31 import java.util.HashMap;
32 import java.util.ArrayList;
33 import java.util.Vector;
34 import java.util.Hashtable;
35
36 import org.apache.axis.client.Call;
37 import org.apache.axis.client.Service;
38 import org.apache.axis.message.SOAPBodyElement;
39 import org.apache.commons.logging.Log;
40 import org.apache.commons.logging.LogFactory;
41 import javax.xml.rpc.ServiceException;
42 import org.astrogrid.util.DomHelper;
43 import org.astrogrid.config.Config;
44 import org.astrogrid.registry.RegistryException;
45
46 import org.astrogrid.registry.common.WSDLInformation;
47 import org.astrogrid.registry.common.WSDLBasicInformation;
48 import org.astrogrid.registry.common.XSLHelper;
49
50 import java.net.MalformedURLException;
51 import org.apache.axis.AxisFault;
52 import org.astrogrid.xmldb.eXist.server.QueryDBService;
53
/***
 *
 * RegistryHarvestService is no longer a web service class, but it still provides the
 * harvesting mechanism used by the server-side servlets, both for the
 * automatic harvest and for a manual harvest started by the user.
 */
60 public class RegistryHarvestService {
61
62 private static final Log log =
63 LogFactory.getLog(RegistryHarvestService.class);
64 private static final String HARVEST_TEMPLATE_URL_PROPERTY =
65 "org.astrogrid.registry.harvest.template.url";
66
67 public static Config conf = null;
68
69 static {
70 if(conf == null) {
71 conf = org.astrogrid.config.SimpleConfig.getSingleton();
72 }
73 }
74
75
76 /***
77 * Takes a Resource entry (a Registry type entry). And performs a full replicate
78 * or harvest from that registry, to populate this registry. Usually there is only one
79 * Registry Resource, but there might be more.
80 *
81 * @param query XML document object representing the query language used on the registry.
82 * @return null (nothing is returned on this web service operation).
83 * @author Kevin Benson
84 */
85 public Document harvestResource(Node resource,Date dt) throws RegistryException, IOException {
86 log.debug("start harvestResource");
87 log.info("update harvestResource");
88
89
90 boolean harvestEnabled = conf.getBoolean("registry.harvest.enabled",false);
91 if(!harvestEnabled) {
92 return null;
93 }
94
95
96
97
98
99
100
101
102
103
104
105
106 NodeList nl = null;
107 if(Node.DOCUMENT_NODE == resource.getNodeType()) {
108 nl = ((Document)resource).getElementsByTagNameNS("*","Resource");
109 }
110 else if(Node.ELEMENT_NODE == resource.getNodeType()) {
111 nl = ((Element)resource).getElementsByTagNameNS("*","Resource");
112 }
113 for(int i = 0; i < nl.getLength();i++) {
114 Element elem = (Element) nl.item(i);
115
116
117
118
119
120
121
122
123
124
125
126
127
128 }
129 log.info("exiting harvestResource");
130 log.debug("end harvestResource");
131 return null;
132 }
133
134 /***
135 * Will start a harvest of all the Registries known to this registry.
136 *
137 * @param resources XML document object representing the query language used on the registry.
138 * @return XML docuemnt object representing the result of the query.
139 * @author Kevin Benson
140 */
141 public void harvestAll(boolean onlyRegistries, boolean useDates) throws RegistryException {
142 log.debug("start harvestAll");
143 Document harvestDoc = null;
144 String xqlQuery = null;
145 String ident = null;
146 onlyRegistries = true;
147 boolean harvestEnabled = conf.getBoolean("registry.harvest.enabled",false);
148 if(!harvestEnabled) {
149 return;
150 }
151
152
153 String versionNumber = null;
154
155 String collectionName = "";
156 QueryDBService qdb = new QueryDBService();
157
158 RegistryAdminService ras = new RegistryAdminService();
159 Document tempDoc = null;
160 try {
161 tempDoc = DomHelper.newDocument();
162 if(onlyRegistries) {
163
164
165
166 RegistryQueryService rqs = new RegistryQueryService();
167 ArrayList versions = rqs.getAstrogridVersions();
168 System.out.println("the number of versions = " + versions);
169 for(int k = 0;k < versions.size();k++) {
170 try {
171 System.out.println("begin work on version = " + (String)versions.get(k));
172 harvestDoc = rqs.getRegistriesQuery((String)versions.get(k));
173
174
175
176 ras.updateNoCheck(harvestDoc,(String)versions.get(k));
177
178
179 NodeList nl = harvestDoc.getElementsByTagNameNS("*","Resource");
180 log.info("Harvest All found this number of resources = " + nl.getLength());
181 for(int i = 0; i < nl.getLength();i++) {
182 Element elem = (Element) nl.item(i);
183 versionNumber = RegistryServerHelper.getRegistryVersionFromNode(elem);
184 versionNumber = versionNumber.replace('.','_');
185 if(useDates) {
186 String dateString = null;
187 try {
188 Document statDoc = qdb.getResource("statv"+versionNumber,RegistryServerHelper.getIdentifier(elem));
189 dateString = DomHelper.getNodeTextValue(statDoc,"StatsDateMillis");
190 }catch(Exception e) {
191 log.warn("ignore for now: could not find a stat/date for element using no date.");
192 }
193 Date dt = null;
194 if(dateString != null && dateString.trim().length() > 0) {
195 dt = new Date(Long.parseLong(dateString));
196 }
197
198 beginHarvest(elem,dt,(String)versions.get(k));
199 }else {
200
201 beginHarvest(elem,null,(String)versions.get(k));
202 }
203 }
204 }catch(Exception e) {
205 log.error("Found exception, but still need to harvest other versions:" + e.getMessage());
206 }
207 }
208 }
209 }catch(ParserConfigurationException pce) {
210 throw new RegistryException(pce);
211 }
212
213
214 }
215
216 /***
217 * Small Thread class to update the registry with a particular number
218 * of Resources. This class inherits from the Thread class, so
219 * the harvesting can continue to keep harvesting more Resources.
220 * Some Registries require paging through the Resources hence, the
221 * multithreading helps performance.
222 *
223 *
224 */
225 private class HarvestThread extends Thread {
226
227 private RegistryAdminService ras = null;
228 private Document updateDoc = null;
229
230 /***
231 * HarvestThread class constructor.
232 * @param ras The RegistryAdminService class which does updates of Resources
233 * @param updateDoc a set of one or more Resources.
234 */
235 public HarvestThread(RegistryAdminService ras, Node updateNode) throws RegistryException {
236 this.ras = ras;
237 if(updateNode instanceof Element) {
238 try {
239 updateDoc = DomHelper.newDocument();
240 updateDoc.appendChild(updateDoc.importNode(updateNode, true));
241 }catch(ParserConfigurationException pce) {
242 throw new RegistryException(pce);
243 }
244 }else if(updateNode instanceof Document) {
245 this.updateDoc = (Document)updateNode;
246 }
247 }
248
249 /***
250 * Begin the update, Calls updateNoCheck from RegistryAdminService, because
251 * it is assumed the Rewsources have been checked and valid and require no
252 * special checking.
253 */
254
255 public void run() {
256
257 try {
258 ras.updateNoCheck(updateDoc,null);
259
260
261
262
263
264 }catch(IOException ioe) {
265 ioe.printStackTrace();
266 }
267 }
268
269 }
270
271 /***
272 * This is the main method which uses the HarvestThread class to begin
273 * harvesting and updates. This method will interrogate Resource entries
274 * and see how to call the Resources via the AccessURL and determine if
275 * it is a WebService or WebBrowser. Then makes the appropriately call
276 * to the web service or browser grabbing there Resources and update into
277 * this Registry.
278 *
279 * @param dt An optional date used to harvest from a particular date
280 * @param resources Set of Resources to harvest on, normally a Registry Resource.
281 */
282 public void beginHarvest(Node resource, Date dt, String version) throws RegistryException, IOException {
283 log.debug("start beginHarvest");
284 log.info("entered beginharvest");
285 int failureCount = 0;
286 boolean resumptionSuccess = false;
287 String accessURL = null;
288 String invocationType = null;
289 boolean isRegistryType;
290 Document doc = null;
291 NodeList nl = null;
292 String soapActionURI = null;
293 SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
294 int threadCount = 0;
295
296
297 RegistryAdminService ras = new RegistryAdminService();
298
299 System.out.println(resource.getNodeName() + " " + resource.getNodeValue());
300
301 NamedNodeMap attributes = resource.getAttributes();
302
303
304
305 Node typeAttribute = resource.getAttributes().getNamedItem("xsi:type");
306 isRegistryType = (typeAttribute != null) &&
307 (typeAttribute.getNodeValue().indexOf("Registry") >= 0);
308
309
310 nl = ((Element) resource).getElementsByTagNameNS("*","AccessURL");
311 if(nl.getLength() == 0) {
312 nl = ((Element) resource).getElementsByTagNameNS("*","accessURL");
313 }
314 if(nl.getLength() == 0) {
315 log.error("Error did not find a AccessURL");
316 throw new RegistryException("No accessURL found");
317 }
318 if(!nl.item(0).hasChildNodes()) {
319 log.error("Error did not find any text to the accessURL");
320 throw new RegistryException("No text found for the accessURL");
321 }
322 accessURL = nl.item(0).getFirstChild().getNodeValue();
323
324 nl = ((Element) resource).getElementsByTagNameNS("*","Invocation");
325 if(nl.getLength() == 0) {
326
327 nl = ((Element) resource).getElementsByTagNameNS("*","interface");
328 if(nl.getLength() > 0) {
329 typeAttribute = ((Element)nl.item(0)).getAttributes().getNamedItem("xsi:type");
330 invocationType = typeAttribute.getNodeValue();
331 }
332 } else {
333 invocationType = nl.item(0).getFirstChild().getNodeValue();
334 }
335
336 if(accessURL.indexOf("?wsdl") != -1) {
337 accessURL = accessURL.substring(0,accessURL.indexOf("?wsdl"));
338 }
339
340
341
342 log.info("The access URL = " + accessURL + " invocationType = " + invocationType);
343
344
345
346 if("WebService".endsWith(invocationType)) {
347
348
349 Element childElem = null;
350 Element root = null;
351
352 if("?wsdl".indexOf(accessURL) == -1) {
353 accessURL += "?wsdl";
354 }
355
356 WSDLBasicInformation wsdlBasic = null;
357 try {
358 wsdlBasic = WSDLInformation.getBasicInformationFromURL(accessURL);
359 } catch(RegistryException re) {
360 re.printStackTrace();
361 log.error(re);
362 }
363 if(wsdlBasic != null) {
364 log.info("calling call obj with endpoint = " +
365 (String)wsdlBasic.getEndPoint().values().iterator().next());
366
367 Call callObj = getCall((String)wsdlBasic.getEndPoint().
368 values().iterator().next());
369
370 try {
371 doc = DomHelper.newDocument();
372
373 String interfaceMethod = "getMetaData";
374 if(isRegistryType) interfaceMethod = "ListRecords";
375 String nameSpaceURI = WSDLInformation.
376 getNameSpaceFromBinding(
377 accessURL,interfaceMethod);
378 if(wsdlBasic.getEndPoint().keys().hasMoreElements()) {
379 soapActionURI = wsdlBasic.getSoapActionURI(
380 (String)wsdlBasic.getEndPoint().keys().nextElement() +
381 "_" + interfaceMethod);
382 }
383 if(soapActionURI != null) {
384 callObj.setSOAPActionURI(soapActionURI);
385 }
386 log.info("SoapActionURI = " + soapActionURI);
387 root = doc.createElementNS(nameSpaceURI,interfaceMethod);
388 if(dt != null) {
389 childElem = doc.createElement("from");
390 childElem.appendChild(doc.createTextNode(sdf.format(dt)));
391 root.appendChild(childElem);
392 }
393 doc.appendChild(root);
394 log.info("Creating soap request for operation name = " +
395 interfaceMethod + " with namespaceuri = " +
396 nameSpaceURI);
397
398 SOAPBodyElement sbeRequest = new SOAPBodyElement(
399 doc.getDocumentElement());
400
401 sbeRequest.setName(interfaceMethod);
402 sbeRequest.setNamespaceURI(wsdlBasic.getTargetNameSpace());
403
404 log.info("Calling invoke on service");
405 Vector result = (Vector) callObj.invoke
406 (new Object[] {sbeRequest});
407
408 if(result.size() > 0) {
409 SOAPBodyElement sbe = (SOAPBodyElement) result.get(0);
410 Document soapDoc = sbe.getAsDocument();
411 log.info("SOAPDOC RETURNED = " + DomHelper.DocumentToString(soapDoc));
412
413 ras.updateNoCheck(soapDoc,version);
414 if(isRegistryType) {
415 nl = DomHelper.getNodeListTags(soapDoc,"resumptionToken");
416 while(nl.getLength() > 0) {
417 Document resumeDoc = DomHelper.newDocument();
418 root = doc.createElementNS(nameSpaceURI,"ListRecords");
419 childElem = doc.createElement("resumptionToken");
420 childElem.appendChild(doc.createTextNode(nl.item(0).getFirstChild().getNodeValue()));
421 sbeRequest = new SOAPBodyElement(resumeDoc.getDocumentElement());
422 sbeRequest.setName("ListRecords");
423 sbeRequest.setNamespaceURI(wsdlBasic.getTargetNameSpace());
424
425 result = (Vector) callObj.invoke
426 (new Object[] {sbeRequest});
427 soapDoc = sbe.getAsDocument();
428
429 ras.updateNoCheck(soapDoc,version);
430 nl = DomHelper.getNodeListTags(soapDoc,"resumptionToken");
431 threadCount++;
432 if(threadCount > 19) {
433 log.info("20 harvest threads have started recently, sleeping for 5 seconds. ");
434 log.info("The activethread count = " + Thread.activeCount());
435 try {
436 Thread.sleep(5000);
437 }catch(InterruptedException ie) {
438 log.info("Possible interruption in the middle of harvest");
439 }
440 threadCount = 0;
441 }
442 }
443 }
444 }
445 } catch(RemoteException re) {
446
447 re.printStackTrace();
448 log.error(re);
449 }
450 catch(ParserConfigurationException pce) {
451 pce.printStackTrace();
452 log.error(pce);
453 }
454 catch(Exception e) {
455 e.printStackTrace();
456 log.error(e);
457 }
458 }
459 }else if("WebBrowser".endsWith(invocationType) || "Extended".endsWith(invocationType)) {
460
461 try {
462 String ending = "";
463
464
465 log.info("inside the web browser");
466
467 if(accessURL.indexOf("?") == -1) {
468 ending = "?verb=ListRecords&metadataPrefix=ivo_vor";
469 if(dt != null) {
470 ending += "&from=" + sdf.format(dt);
471 }
472 }
473
474 log.info("Grabbing Document from this url = " + accessURL + ending);
475 doc = DomHelper.newDocument(new URL(accessURL + ending));
476 log.info("Okay got this far to reading the url doc = " +
477 DomHelper.DocumentToString(doc));
478
479 ras.updateNoCheck(doc,version);
480 NodeList moreTokens = null;
481
482
483
484
485 while( doc != null && (moreTokens = doc.getElementsByTagName("resumptionToken")).
486 getLength() > 0 && moreTokens.item(0).hasChildNodes()) {
487 Node nd = moreTokens.item(0);
488 if(accessURL.indexOf("?") != -1) {
489 accessURL = accessURL.substring(0,accessURL.indexOf("?"));
490 }
491 ending = "?verb=ListRecords&resumptionToken=" +
492 nd.getFirstChild().getNodeValue();
493 log.info(
494 "the harvestcallregistry's with resumptionToken accessurl inside the token calls = " +
495 accessURL + ending);
496 while(failureCount <= 2 && !resumptionSuccess) {
497 try {
498 doc = DomHelper.newDocument(new URL(accessURL + ending));
499 resumptionSuccess = true;
500 }catch(Exception e) {
501 log.error("Seemed to fail for = " + accessURL + ending);
502 log.error("Exception: " + e.getMessage());
503 log.info("try another in case web server has not caught up");
504 failureCount++;
505 resumptionSuccess = false;
506 }
507 }
508 if(resumptionSuccess) {
509
510 ras.updateNoCheck(doc,version);
511
512
513
514
515
516
517
518
519
520
521
522
523
524 }else {
525 doc = null;
526 }
527 failureCount = 0;
528 resumptionSuccess = false;
529 }
530 }catch(ParserConfigurationException pce) {
531 pce.printStackTrace();
532 log.error(pce);
533 }catch(SAXException sax) {
534 sax.printStackTrace();
535 log.error(sax);
536 }catch(IOException ioe){
537 ioe.printStackTrace();
538 log.error(ioe);
539 }
540 }
541 log.info("exiting beginHarvest");
542 log.debug("end beginHarvest");
543 }
544
545 /***
546 * Method to establish a Service and a Call to the server side web service.
547 * @return Call object which has the necessary properties set for an Axis message style.
548 * @throws Exception
549 * @author Kevin Benson
550 */
551 private Call getCall(String endPoint) {
552 log.debug("start getCall");
553 Call _call = null;
554 try {
555 Service service = new Service();
556 _call = (Call) service.createCall();
557 _call.setTargetEndpointAddress(endPoint);
558 _call.setSOAPActionURI("");
559 _call.setOperationStyle(org.apache.axis.enum.Style.MESSAGE);
560 _call.setOperationUse(org.apache.axis.enum.Use.LITERAL);
561 _call.setEncodingStyle(null);
562 } catch(ServiceException se) {
563 se.printStackTrace();
564 log.error(se);
565 _call = null;
566 }finally {
567 log.debug("end getCall");
568 }
569 return _call;
570 }
571 }