View Javadoc

1   /*
2    * First created on 25-Apr-2003
3    *
4    */
5   package org.astrogrid.registry.server.harvest;
6   
7   import org.astrogrid.registry.server.XQueryExecution;
8   import java.rmi.RemoteException;
9   
10  
11  import java.io.IOException;
12  import org.xml.sax.SAXException;
13  import javax.xml.parsers.ParserConfigurationException;
14  import org.w3c.dom.Document;
15  import javax.xml.parsers.DocumentBuilderFactory;
16  import javax.xml.parsers.DocumentBuilder;
17  import org.w3c.dom.Element;
18  import org.w3c.dom.NodeList;
19  import org.w3c.dom.Node;
20  import org.w3c.dom.NamedNodeMap;
21  import org.xml.sax.InputSource;
22  import org.astrogrid.registry.server.RegistryServerHelper;
23  import org.astrogrid.registry.server.QueryHelper;
24  import org.astrogrid.registry.server.admin.RegistryAdminService;
25  import org.astrogrid.registry.server.query.RegistryQueryService;
26  import java.net.URL;
27  import java.io.Reader;
28  import java.io.StringReader;
29  import java.util.Date;
30  import java.text.SimpleDateFormat;
31  import java.util.HashMap;
32  import java.util.ArrayList;
33  import java.util.Vector;
34  import java.util.Hashtable;
35  
36  import org.apache.axis.client.Call;
37  import org.apache.axis.client.Service;
38  import org.apache.axis.message.SOAPBodyElement;
39  import org.apache.commons.logging.Log;
40  import org.apache.commons.logging.LogFactory;
41  import javax.xml.rpc.ServiceException;
42  import org.astrogrid.util.DomHelper;
43  import org.astrogrid.config.Config;
44  import org.astrogrid.registry.RegistryException;
45  
46  import org.astrogrid.registry.common.WSDLInformation;
47  import org.astrogrid.registry.common.WSDLBasicInformation;
48  import org.astrogrid.registry.common.XSLHelper;
49  
50  import java.net.MalformedURLException;
51  import org.apache.axis.AxisFault;
52  import org.astrogrid.xmldb.eXist.server.QueryDBService;
53  
54  /***
55   *
56   * RegistryHarvestService is no longer a web service class, but still posses the 
57   * harvesting mechanism that is used by server side servlets which uses
58   * automatic harvest mechanism and manual harvest by the user.
59   */
60  public class RegistryHarvestService {
61  
62     private static final Log log =
63                            LogFactory.getLog(RegistryHarvestService.class);
64     private static final String HARVEST_TEMPLATE_URL_PROPERTY =
65                            "org.astrogrid.registry.harvest.template.url";
66  
67     public static Config conf = null;
68  
69     static {
70        if(conf == null) {
71           conf = org.astrogrid.config.SimpleConfig.getSingleton();
72        }
73     }
74  
75  
76    /***
77      * Takes a Resource entry (a Registry type entry).  And performs a full replicate
78      * or harvest from that registry, to populate this registry.  Usually there is only one
79      * Registry Resource, but there might be more.
80      *
81      * @param query XML document object representing the query language used on the registry.
82      * @return null (nothing is returned on this web service operation).
83      * @author Kevin Benson
84      */
85     public Document harvestResource(Node resource,Date dt)  throws RegistryException, IOException {
86        log.debug("start harvestResource");
87        log.info("update harvestResource");
88  
89  
90        boolean harvestEnabled = conf.getBoolean("registry.harvest.enabled",false);
91        if(!harvestEnabled) {
92            return null;
93        }
94        //RegistryAdminService ras = new RegistryAdminService();
95  
96        //Change this up to look at the Node make sure it is a REgistryType or a Web service
97        //if not then throw an exception if so then try to call it.
98  
99        //Okay this is just a small xsl sheet to make sure the xml is formatted in
100       //a nice consistent way.  Because currently the schema espcially version 0.9
101       //allows the user to put the xml in a few different ways.
102 
103       //Okay update this one resource entry.
104       //ras.updateResource(resource);
105       
106       NodeList nl = null;
107       if(Node.DOCUMENT_NODE == resource.getNodeType()) {
108           nl = ((Document)resource).getElementsByTagNameNS("*","Resource");
109       }
110       else if(Node.ELEMENT_NODE == resource.getNodeType()) {
111           nl = ((Element)resource).getElementsByTagNameNS("*","Resource");
112       }
113       for(int i = 0; i < nl.getLength();i++) {
114           Element elem = (Element) nl.item(i);
115           /*
116           if(dt != null) {
117               //Document statDoc = qdb.getResource("statv"+versionNumber,RegistryServerHelper.getIdentifier(elem));
118               //String dateString = DomHelper.getNodeTextValue(statDoc,"StatsDateMillis");
119               //Date dt = new Date(Long.parseLong(dateString));
120               //harvestResource(elem,dt);
121               beginHarvest(elem,dt);
122           }else {
123               //harvestResource(elem,null);
124               beginHarvest(elem,null);
125           }//else
126           */   
127           //beginHarvest(elem,null,null);          
128       }
129       log.info("exiting harvestResource");
130       log.debug("end harvestResource");
131       return null;
132    }
133 
134    /***
135        * Will start a harvest of all the Registries known to this registry.
136        *
137        * @param resources XML document object representing the query language used on the registry.
138        * @return XML docuemnt object representing the result of the query.
139        * @author Kevin Benson
140        */
141    public void harvestAll(boolean onlyRegistries, boolean useDates) throws RegistryException  {
142       log.debug("start harvestAll");
143       Document harvestDoc = null;
144       String xqlQuery = null;
145       String ident = null;
146       onlyRegistries = true;
147       boolean harvestEnabled = conf.getBoolean("registry.harvest.enabled",false);
148       if(!harvestEnabled) {
149           return;
150       }
151 
152 
153       String versionNumber = null;
154       //String collectionName = "astrogridv" + versionNumber;
155       String collectionName = "";
156       QueryDBService qdb = new QueryDBService();
157       //instantiate the Admin service that contains the update methods.c
158       RegistryAdminService ras = new RegistryAdminService();
159       Document tempDoc = null;
160       try {
161           tempDoc = DomHelper.newDocument();
162           if(onlyRegistries) {
163              //query for all the Registry types which should be all of them with an xsi:type="RegistryType"
164              //xqlQuery = "declare namespace vr = \"http://www.ivoa.net/xml/VOResource/v0.9\"; //vr:Resource[@xsi:type='RegistryType']";
165              //System.out.println("The harvestDoc = " + DomHelper.DocumentToString(harvestDoc));
166              RegistryQueryService rqs = new RegistryQueryService();
167              ArrayList versions = rqs.getAstrogridVersions();
168              System.out.println("the number of versions = " + versions);
169              for(int k = 0;k < versions.size();k++) {
170                  try {
171                  System.out.println("begin work on version = " + (String)versions.get(k));
172                  harvestDoc = rqs.getRegistriesQuery((String)versions.get(k));
173                  //tempDoc.appendChild(
174                  //        tempDoc.importNode(harvestDoc.getDocumentElement(),true));
175                  //ras.updateResource(harvestDoc);
176                  ras.updateNoCheck(harvestDoc,(String)versions.get(k));                 
177 
178                  //log.info("try just the Resource");
179                  NodeList nl = harvestDoc.getElementsByTagNameNS("*","Resource");
180                  log.info("Harvest All found this number of resources = " + nl.getLength());
181                  for(int i = 0; i < nl.getLength();i++) {
182                    Element elem = (Element) nl.item(i);
183                    versionNumber = RegistryServerHelper.getRegistryVersionFromNode(elem);
184                    versionNumber = versionNumber.replace('.','_');               
185                    if(useDates) {
186                       String dateString = null;
187                       try {
188                           Document statDoc = qdb.getResource("statv"+versionNumber,RegistryServerHelper.getIdentifier(elem));
189                           dateString = DomHelper.getNodeTextValue(statDoc,"StatsDateMillis");
190                       }catch(Exception e) {
191                          log.warn("ignore for now: could not find a stat/date for element using no date.");
192                       }                      
193                       Date dt = null;
194                       if(dateString != null && dateString.trim().length() > 0) {
195                           dt = new Date(Long.parseLong(dateString));
196                       }
197                       //harvestResource(elem,dt);
198                       beginHarvest(elem,dt,(String)versions.get(k));
199                    }else {
200                     //harvestResource(elem,null);
201                     beginHarvest(elem,null,(String)versions.get(k));
202                    }//else
203                  }//for
204                  }catch(Exception e) {
205                      log.error("Found exception, but still need to harvest other versions:" + e.getMessage());
206                  }
207              }//for
208           }
209       }catch(ParserConfigurationException pce) {
210       	throw new RegistryException(pce);
211       }//catch(IOException ioe) {
212       	//throw new RegistryException(ioe);
213       //}
214    }
215 
216 /***
217  * Small Thread class to update the registry with a particular number
218  * of Resources.  This class inherits from the Thread class, so
219  * the harvesting can continue to keep harvesting more Resources.
220  * Some Registries require paging through the Resources hence, the
221  * multithreading helps performance.
222  *
223  *
224  */
225 private class HarvestThread extends Thread {
226 
227    private RegistryAdminService ras = null;
228    private Document updateDoc = null;
229 
230    /***
231     * HarvestThread class constructor.
232     * @param ras The RegistryAdminService class which does updates of Resources
233     * @param updateDoc a set of one or more Resources.
234     */
235    public HarvestThread(RegistryAdminService ras, Node updateNode) throws RegistryException {
236       this.ras = ras;
237       if(updateNode instanceof Element) {
238         try {
239       	updateDoc = DomHelper.newDocument();
240          updateDoc.appendChild(updateDoc.importNode(updateNode, true));
241         }catch(ParserConfigurationException pce) {
242         	throw new RegistryException(pce);
243         }
244       }else if(updateNode instanceof Document) {
245         this.updateDoc = (Document)updateNode;
246       }
247    }
248 
249    /***
250     * Begin the update, Calls updateNoCheck from RegistryAdminService, because
251     * it is assumed the Rewsources have been checked and valid and require no
252     * special checking.
253     */
254 
255    public void run() {
256 	  //Element el = updateDoc.getDocumentElement();
257       try {
258          ras.updateNoCheck(updateDoc,null);
259          //updateDoc = null;
260          //System.gc();
261   //       ras.Update(updateDoc);
262   //    }catch(MalformedURLException mue) {
263   //       mue.printStackTrace();
264       }catch(IOException ioe) {
265          ioe.printStackTrace();
266       }
267    }
268 
269 }
270 
271    /***
272     * This is the main method which uses the HarvestThread class to begin
273     * harvesting and updates.  This method will interrogate Resource entries
274     * and see how to call the Resources via the AccessURL and determine if
275     * it is a WebService or WebBrowser.  Then makes the appropriately call
276     * to the web service or browser grabbing there Resources and update into
277     * this Registry.
278     *
279     * @param dt An optional date used to harvest from a particular date
280     * @param resources Set of Resources to harvest on, normally a Registry Resource.
281     */
282    public void beginHarvest(Node resource, Date dt, String version)  throws RegistryException, IOException  {
283       log.debug("start beginHarvest");
284       log.info("entered beginharvest");
285       int failureCount = 0;
286       boolean resumptionSuccess = false;      
287       String accessURL = null;
288       String invocationType = null;
289       boolean isRegistryType;
290       Document doc = null;
291       NodeList nl = null;
292       String soapActionURI = null;
293       SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
294       int threadCount = 0;
295 
296       //instantiate the Admin service that contains the update methods.
297       RegistryAdminService ras = new RegistryAdminService();
298 
299       System.out.println(resource.getNodeName() + " " + resource.getNodeValue());
300 
301       NamedNodeMap attributes = resource.getAttributes();
302 
303       //get the accessurl and invocation type.
304       //invocationtype is either WebService or WebBrowser.
305       Node typeAttribute = resource.getAttributes().getNamedItem("xsi:type");
306       isRegistryType = (typeAttribute != null) &&
307                        (typeAttribute.getNodeValue().indexOf("Registry") >= 0);
308  //   System.out.println("RegistryType attribute =" + isRegistryType);
309 
310       nl = ((Element) resource).getElementsByTagNameNS("*","AccessURL");
311       if(nl.getLength() == 0) {
312           nl = ((Element) resource).getElementsByTagNameNS("*","accessURL");
313       }
314       if(nl.getLength() == 0) {
315           log.error("Error did not find a AccessURL");
316           throw new RegistryException("No accessURL found");
317       }
318       if(!nl.item(0).hasChildNodes()) {
319           log.error("Error did not find any text to the accessURL");
320           throw new RegistryException("No text found for the accessURL");          
321       }
322       accessURL = nl.item(0).getFirstChild().getNodeValue();
323 
324       nl = ((Element) resource).getElementsByTagNameNS("*","Invocation");
325       if(nl.getLength() == 0) {
326           //Need to look for interface here.
327           nl = ((Element) resource).getElementsByTagNameNS("*","interface");
328           if(nl.getLength() > 0) { 
329               typeAttribute = ((Element)nl.item(0)).getAttributes().getNamedItem("xsi:type");
330               invocationType = typeAttribute.getNodeValue();
331           }
332       } else {          
333           invocationType = nl.item(0).getFirstChild().getNodeValue();
334       }
335       
336       if(accessURL.indexOf("?wsdl") != -1) {
337           accessURL = accessURL.substring(0,accessURL.indexOf("?wsdl"));
338       }
339 
340 //    accessURL = DomHelper.getNodeTextValue((Element)resourceList.item(i),"AccessURL","vr");
341 //    invocationType = DomHelper.getNodeTextValue((Element)resourceList.item(i),"Invocation","vr");
342       log.info("The access URL = " + accessURL + " invocationType = " + invocationType);
343 //    System.out.println("The access URL = " + accessURL + " invocationType = " + invocationType);
344 
345 
346       if("WebService".endsWith(invocationType)) {
347          //call the service
348          //remember to look at the date
349          Element childElem = null;
350          Element root = null;
351 
352          if("?wsdl".indexOf(accessURL) == -1) {
353             accessURL += "?wsdl";
354          }
355          //Read in the wsdl for the endpoint and namespace
356          WSDLBasicInformation wsdlBasic = null;
357          try {
358             wsdlBasic = WSDLInformation.getBasicInformationFromURL(accessURL);
359          } catch(RegistryException re) {
360             re.printStackTrace();
361             log.error(re);
362          }
363          if(wsdlBasic != null) {
364             log.info("calling call obj with endpoint = " +
365                      (String)wsdlBasic.getEndPoint().values().iterator().next());
366             //create a call object
367             Call callObj = getCall((String)wsdlBasic.getEndPoint().
368                            values().iterator().next());
369 
370             try {
371                doc = DomHelper.newDocument();
372                //set the operation name/interface method to ListResources
373                String interfaceMethod = "getMetaData";
374                if(isRegistryType) interfaceMethod = "ListRecords";
375                String nameSpaceURI = WSDLInformation.
376                                      getNameSpaceFromBinding(
377 								        accessURL,interfaceMethod);
378                if(wsdlBasic.getEndPoint().keys().hasMoreElements()) {
379                    soapActionURI = wsdlBasic.getSoapActionURI(
380                      (String)wsdlBasic.getEndPoint().keys().nextElement() + 
381                      "_" + interfaceMethod);
382                }
383                if(soapActionURI != null) {
384                    callObj.setSOAPActionURI(soapActionURI);
385                }//if
386                log.info("SoapActionURI = " + soapActionURI);
387                root = doc.createElementNS(nameSpaceURI,interfaceMethod);
388                if(dt != null) {
389                   childElem = doc.createElement("from");
390                   childElem.appendChild(doc.createTextNode(sdf.format(dt)));
391                   root.appendChild(childElem);
392                }//if
393                doc.appendChild(root);
394                log.info("Creating soap request for operation name = " +
395                          interfaceMethod + " with namespaceuri = " +
396                          nameSpaceURI);
397 
398                SOAPBodyElement sbeRequest = new SOAPBodyElement(
399                                                 doc.getDocumentElement());
400                //sbeRequest.setName("harvestAll");
401                sbeRequest.setName(interfaceMethod);
402                sbeRequest.setNamespaceURI(wsdlBasic.getTargetNameSpace());
403                //invoke the web service call
404                log.info("Calling invoke on service");
405                Vector result = (Vector) callObj.invoke
406                                         (new Object[] {sbeRequest});
407                //Take the results and harvest.
408                if(result.size() > 0) {
409                    SOAPBodyElement sbe = (SOAPBodyElement) result.get(0);
410                    Document soapDoc = sbe.getAsDocument();
411                    log.info("SOAPDOC RETURNED = " + DomHelper.DocumentToString(soapDoc));
412                    //(new HarvestThread(ras,soapDoc.getDocumentElement())).start();
413                    ras.updateNoCheck(soapDoc,version);
414                    if(isRegistryType) {
415                       nl = DomHelper.getNodeListTags(soapDoc,"resumptionToken");
416                       while(nl.getLength() > 0) {
417                          Document resumeDoc = DomHelper.newDocument();
418                          root = doc.createElementNS(nameSpaceURI,"ListRecords");
419                           childElem = doc.createElement("resumptionToken");
420                           childElem.appendChild(doc.createTextNode(nl.item(0).getFirstChild().getNodeValue()));
421                           sbeRequest = new SOAPBodyElement(resumeDoc.getDocumentElement());
422                           sbeRequest.setName("ListRecords");
423                           sbeRequest.setNamespaceURI(wsdlBasic.getTargetNameSpace());
424                           //invoke the web service call
425                           result = (Vector) callObj.invoke
426     									    (new Object[] {sbeRequest});
427                           soapDoc = sbe.getAsDocument();
428                           //(new HarvestThread(ras,soapDoc.getDocumentElement().cloneNode(true))).start();
429                           ras.updateNoCheck(soapDoc,version);
430                            nl = DomHelper.getNodeListTags(soapDoc,"resumptionToken");
431                            threadCount++;                           
432                            if(threadCount > 19) {
433                                log.info("20 harvest threads have started recently, sleeping for 5 seconds. ");
434                                log.info("The activethread count = " + Thread.activeCount());
435                                try {
436                                    Thread.sleep(5000);
437                                }catch(InterruptedException ie) {
438                                    log.info("Possible interruption in the middle of harvest");
439                                }
440                                threadCount = 0;
441                            }//if
442                       }//while
443                    }//if
444                }//if
445             } catch(RemoteException re) {
446                 //log error
447                 re.printStackTrace();
448                 log.error(re);
449             }
450             catch(ParserConfigurationException pce) {
451                 pce.printStackTrace();
452                 log.error(pce);
453             }
454             catch(Exception e) {
455                 e.printStackTrace();
456                 log.error(e);
457             }
458          }
459       }else if("WebBrowser".endsWith(invocationType) || "Extended".endsWith(invocationType)) {
460          //its a web browser so assume for oai.
461          try {
462             String ending = "";
463             //might need to put some oai date stuff on the end.  This is
464             //unknown.
465             log.info("inside the web browser");
466 
467             if(accessURL.indexOf("?") == -1) {
468                ending = "?verb=ListRecords&metadataPrefix=ivo_vor"; //&from=" + date;
469                if(dt != null) {
470                   ending += "&from=" + sdf.format(dt);
471                }
472             }
473 
474             log.info("Grabbing Document from this url = " + accessURL + ending);
475             doc = DomHelper.newDocument(new URL(accessURL + ending));
476             log.info("Okay got this far to reading the url doc = " +
477                       DomHelper.DocumentToString(doc));
478             //(new HarvestThread(ras,doc.getDocumentElement().cloneNode(true))).start();
479             ras.updateNoCheck(doc,version);
480             NodeList moreTokens = null;
481             //log.info("resumptionToken length = " +
482             //         doc.getElementsByTagName("resumptionToken").
483             //         getLength());
484             //if there are more paging(next) then keep calling them.
485             while( doc != null && (moreTokens = doc.getElementsByTagName("resumptionToken")).
486                                      getLength() > 0 && moreTokens.item(0).hasChildNodes()) {
487                Node nd = moreTokens.item(0);
488                if(accessURL.indexOf("?") != -1) {
489                   accessURL = accessURL.substring(0,accessURL.indexOf("?"));
490                }
491                ending = "?verb=ListRecords&resumptionToken=" +
492                          nd.getFirstChild().getNodeValue();
493                log.info(
494                "the harvestcallregistry's with resumptionToken accessurl inside the token calls = " +
495                           accessURL + ending);
496                while(failureCount <= 2 && !resumptionSuccess) {
497                try {
498                    doc = DomHelper.newDocument(new URL(accessURL + ending));
499                    resumptionSuccess = true;
500                }catch(Exception e) {
501                    log.error("Seemed to fail for = " + accessURL + ending);
502                    log.error("Exception: " + e.getMessage());
503                    log.info("try another in case web server has not caught up");
504                    failureCount++;
505                    resumptionSuccess = false;
506                }
507                }//while
508                if(resumptionSuccess) {
509                    //(new HarvestThread(ras,doc)).start();
510                    ras.updateNoCheck(doc,version);
511                    /*
512                    threadCount++;                           
513                    if(threadCount > 6) {
514                        log.info("5 harvest threads have started recently, sleeping for 5 seconds. ");
515                        log.info("The activethread count = " + Thread.activeCount());
516                        try {
517                            Thread.sleep(5000);
518                        }catch(InterruptedException ie) {
519                            log.info("Possible interruption in the middle of harvest");
520                        }
521                        threadCount = 0;
522                    }//if
523                    */
524                }else {
525                    doc = null;
526                }//else
527                failureCount = 0;
528                resumptionSuccess = false;
529             }//while
530          }catch(ParserConfigurationException pce) {
531             pce.printStackTrace();
532             log.error(pce);
533          }catch(SAXException sax) {
534             sax.printStackTrace();
535             log.error(sax);
536          }catch(IOException ioe){
537             ioe.printStackTrace();
538             log.error(ioe);
539          }
540       }//elseif
541       log.info("exiting beginHarvest");
542       log.debug("end beginHarvest");
543    }//beginHarvest
544 
545    /***
546     * Method to establish a Service and a Call to the server side web service.
547     * @return Call object which has the necessary properties set for an Axis message style.
548     * @throws Exception
549     * @author Kevin Benson
550     */
551    private Call getCall(String endPoint) {
552       log.debug("start getCall");
553       Call _call = null;
554       try {
555          Service  service = new Service();
556          _call = (Call) service.createCall();
557          _call.setTargetEndpointAddress(endPoint);
558          _call.setSOAPActionURI("");
559          _call.setOperationStyle(org.apache.axis.enum.Style.MESSAGE);
560          _call.setOperationUse(org.apache.axis.enum.Use.LITERAL);
561          _call.setEncodingStyle(null);
562       } catch(ServiceException se) {
563          se.printStackTrace();
564          log.error(se);
565          _call = null;
566       }finally {
567          log.debug("end getCall");
568       }
569       return _call;
570    }//getCall
571 }