/*
* oaipmhRequest.java
*
* Created on Sept 26, 2006, 10:35 PM
*
* To change this template, choose Tools | Options and locate the template under
* the Source Creation and Management node. Right-click the template and choose
* Open. You can then make changes to the template in the Source Editor.
*
*Sample verb requests:
*
* ListIdentifiers "&metadataPrefix=cr_unixml&set=10.1007:58311:2006" listids
* ListIdentifiers "&metadataPrefix=cr_unixml&from=2007-01-29" listids
* ListRecords "&metadataPrefix=cr_unixml&set=10.1155:50202:2006" hindawi_msn_listRecord
*
*/
import java.sql.*;
import sun.net.ftp.*;
import javax.mail.*;
import javax.mail.internet.*;
import javax.activation.*;
import java.io.*;
import java.util.*;
import java.text.*;
import java.util.Date;
import java.util.regex.*;
import java.util.Properties;
import java.net.*;
import java.net.URLEncoder.*;
import java.lang.*;
import java.lang.Math.*;
import crtools.crutils.*;
//import net.handle.hdllib.*;
import HTTPClient.*;
import org.xml.sax.*;
import javax.xml.parsers.*;
import org.xml.sax.helpers.DefaultHandler;
import org.apache.xerces.parsers.DOMParser;
/**
*
* @author Jon Stark
*/
public class oaipmhRequest {
// NOTE(review): not referenced in the visible part of this file; presumably a Windows-platform flag -- confirm.
static boolean isWin = false;
// When true, testUrl also writes cleanly parsed responses to disk (responses with parse errors are written regardless).
// Set in main when the fourth command-line argument equals "true".
static boolean save_files = false;
// Calendar instance created by the most recent startTime()/getElapsed() call.
static Calendar calTime = null;
// Epoch milliseconds recorded by the most recent startTime()/getElapsed() call; getElapsed() measures from this value.
static long globTime = 0;
// static List globVals = new ArrayList();
// NOTE(review): not referenced in the visible part of this file; presumably per-verb timing values for logStats -- confirm.
static List verbVals = new ArrayList();
// NOTE(review): the following five fields are not referenced in the visible part of this file -- confirm usage before changing.
static String lastCommand = "";
static String lastPage = "";
static String lastFile = "";
static int errCount1 = 0;
static int errCount2 = 0;
// Reset to 0 by testUrl and incremented by FullParser.startElement for each element whose local name is "record".
static int doiCount;
// Request counter: testUrl resets it to 1; testUrl and getSetData increment it after each pass of their request loop.
static int numRequests;
// Body text of the most recent HTTP response fetched by testUrl/getSetData; SAXError reads it when reporting.
static String page;
/**
 * Prints a prompt and blocks until a line (normally just Enter) is read
 * from standard input. The text that was typed is discarded.
 *
 * If reading standard input fails, an error message is printed and the
 * JVM exits with status 1.
 */
static private void pauseUntilResponse(){
    System.out.print("Please press Enter when you're ready to resume? ");
    // The reader is deliberately not closed: closing it would close System.in
    // for the rest of the program.
    BufferedReader br = new BufferedReader(new InputStreamReader(System.in));
    try {
        br.readLine();
    } catch (IOException ioe) {
        System.out.println("\nIO error trying to read your response!");
        System.exit(1);
    }
}
/**
 * Writes total / minimum / maximum / average timing figures to the log.
 *
 * @param log   stream the four summary lines are written to
 * @param times elapsed times in milliseconds; each element's toString()
 *              must parse as a decimal long
 *
 * Nothing is written when the list is null or empty. Any failure while
 * parsing or writing is reported on stdout/stderr and otherwise ignored,
 * so logging statistics never aborts the caller.
 */
static private void logStats(DataOutputStream log, List times){
    // With no samples there is no meaningful min/max/avg, and computing the
    // average would divide by zero.
    if (times == null || times.isEmpty()) {
        return;
    }
    // Seed min/max with the extreme long values rather than 0, so that a
    // genuine 0 ms sample is not mistaken for "not set yet".
    long min = Long.MAX_VALUE;
    long max = Long.MIN_VALUE;
    long total = 0;
    long count = 0;
    try{
        for (Iterator it = times.iterator(); it.hasNext(); ) {
            long val = Long.parseLong(it.next().toString());
            if (val < min) {
                min = val;
            }
            if (val > max) {
                max = val;
            }
            total += val;
            count++;
        }
        long avg = total/count;
        log.writeBytes("Total time for verb with resumptions: "+formatTime(total)+
            "\nMinimum time for query: "+formatTime(min)+
            "\nMaximum time for query: "+formatTime(max)+
            "\nAverage for verb: "+formatTime(avg)+"\n");
    } catch (Exception e) {
        System.out.println(e.toString());
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
/**
 * Returns the current local date and time as "yyyyMMdd HH:mm:ss"
 * (month, day, hour, minute and second zero-padded to two digits;
 * hour is on the 24-hour clock).
 *
 * @return the timestamp, or an empty string if formatting failed
 */
static private String getCurrTime(){
    String stamp = "";
    try{
        Calendar now = new GregorianCalendar();
        DecimalFormat twoDigits = new DecimalFormat("00");
        StringBuffer sb = new StringBuffer();
        sb.append(now.get(Calendar.YEAR));
        // Calendar.MONTH is zero-based (0 = January), hence the + 1.
        sb.append(twoDigits.format(now.get(Calendar.MONTH) + 1));
        sb.append(twoDigits.format(now.get(Calendar.DAY_OF_MONTH)));
        sb.append(" ");
        sb.append(twoDigits.format(now.get(Calendar.HOUR_OF_DAY)));
        sb.append(":");
        sb.append(twoDigits.format(now.get(Calendar.MINUTE)));
        sb.append(":");
        sb.append(twoDigits.format(now.get(Calendar.SECOND)));
        stamp = sb.toString();
    } catch (Exception e) {
        System.out.println(e.toString());
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    return stamp;
}
/**
 * Formats a duration given in milliseconds as "HH:MM:SS.mmm".
 * Hours and minutes are padded to at least two digits; seconds are shown
 * with two integer digits and three decimals.
 *
 * @param mSeconds duration in milliseconds
 * @return the formatted duration, or an empty string if formatting failed
 */
static private String formatTime(long mSeconds){
    String formatted = "";
    try{
        long wholeHours = mSeconds / 3600000L;
        long wholeMinutes = (mSeconds % 3600000L) / 60000L;
        // What is left after removing whole minutes, expressed in seconds.
        float seconds = (mSeconds % 60000L) / 1000F;
        DecimalFormat twoDigits = new DecimalFormat("00");
        DecimalFormat secondsFormat = new DecimalFormat("00.000");
        StringBuffer out = new StringBuffer();
        out.append(twoDigits.format(wholeHours));
        out.append(":");
        out.append(twoDigits.format(wholeMinutes));
        out.append(":");
        out.append(secondsFormat.format(seconds));
        formatted = out.toString();
    } catch (Exception e) {
        System.out.println(e.toString());
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    return formatted;
}
/**
 * Returns the milliseconds elapsed since the previous startTime() or
 * getElapsed() call, and restarts the measurement from now by updating
 * the shared calTime / globTime fields.
 *
 * @return elapsed milliseconds, or 0 if an exception occurred
 */
static private long getElapsed(){
    long elapsed = 0;
    try{
        calTime = new GregorianCalendar();
        long now = calTime.getTimeInMillis();
        elapsed = now - globTime;
        // The next measurement starts from this instant.
        globTime = now;
    } catch (Exception e) {
        System.out.println(e.toString());
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
    return elapsed;
}
/**
 * Starts (or restarts) the shared stopwatch: stores a fresh calendar in
 * calTime and its epoch-millisecond value in globTime.
 */
static private void startTime(){
    try{
        GregorianCalendar fresh = new GregorianCalendar();
        calTime = fresh;
        globTime = fresh.getTimeInMillis();
    } catch (Exception e) {
        System.out.println(e.toString());
        System.err.println(e.getMessage());
        e.printStackTrace();
    }
}
// Set to true by the FullParser and SAXError callbacks whenever the XML parser reports a warning, error or fatal error.
// testUrl clears it immediately before parsing a response and checks it afterwards.
static boolean parseError = false;
// ====================================================================================================
/**
 * SAX handler used to validate a response and count its records.
 *
 * Every start tag whose local name is "record" increments the enclosing
 * class's doiCount. Any warning, error or fatal error reported by the
 * parser sets the enclosing class's parseError flag and is printed to
 * stderr as "[type] file:line:column: message".
 */
static class FullParser extends DefaultHandler {
    // Character data seen since the most recent start tag.
    private String content="";
    protected static final String DEFAULT_PARSER_NAME = "org.apache.xerces.parsers.SAXParser";

    public FullParser() {}

    /** No action is taken on end tags. */
    public void endElement(String uri, String localName, String qName)
    throws SAXException
    {
    }

    /** Clears the collected text and counts "record" elements. */
    public void startElement(String uri, String local, String raw,
    Attributes attrs) throws SAXException {
        content = ""; //=== always clear the content variable, in case we do collect content from this tag
        if (local.equals("record") )
        {
            doiCount++;
        }
    }

    /** Appends the reported characters to the text collected for the current element. */
    public void characters(char ch[], int start, int length)
    throws SAXException {
        content = content + new String(ch, start, length);
    }

    /** Ignorable whitespace is discarded. */
    public void ignorableWhitespace(char ch[], int start, int length)
    throws SAXException {
    }

    public void warning(SAXParseException ex) throws SAXException
    {
        parseError=true;
        printError("Warning", ex);
    } // warning(SAXParseException)

    public void error(SAXParseException ex) throws SAXException
    {
        parseError=true;
        printError("Error", ex);
    } // error(SAXParseException)

    public void fatalError(SAXParseException ex) throws SAXException
    {
        parseError=true;
        printError("Fatal Error", ex);
    } // fatalError(SAXParseException)

    /**
     * Prints one diagnostic line to stderr.
     *
     * @param type label shown in square brackets, e.g. "Warning"
     * @param ex   the parser's exception; when null only the label and a
     *             "!!!" marker are printed
     */
    protected void printError(String type, SAXParseException ex) {
        System.err.print("[");
        System.err.print(type);
        System.err.print("] ");
        if (ex== null) {
            System.out.println("!!!");
            // There is no location or message to report; finish the stderr
            // line instead of dereferencing the null exception.
            System.err.println();
            System.err.flush();
            return;
        }
        String systemId = ex.getSystemId();
        if (systemId != null) {
            // Show only the last path segment of the system identifier.
            int index = systemId.lastIndexOf('/');
            if (index != -1)
                systemId = systemId.substring(index + 1);
            System.err.print(systemId);
        }
        System.err.print(':');
        System.err.print(ex.getLineNumber());
        System.err.print(':');
        System.err.print(ex.getColumnNumber());
        System.err.print(": ");
        System.err.print(ex.getMessage());
        System.err.println();
        System.err.flush();
    }
}
// ====================================================================================================
/**
 * SAX ErrorHandler that reports parser diagnostics on stdout.
 *
 * Each callback sets the enclosing class's parseError flag, prints the
 * message with its line/column and the current numRequests value, and then
 * prints an " id:" line derived from the text of the current page up to
 * the reported column.
 */
static class SAXError implements ErrorHandler
{
    // NOTE(review): as written, the capture group in this pattern always
    // matches the empty string (the leading greedy ".*" consumes everything).
    // Presumably literal markup around the group was lost at some point --
    // confirm against version control before relying on the " id:" output.
    private static final Pattern ID_PATTERN = Pattern.compile("(?ism).*(.*?).*?$");

    public void warning(SAXParseException exception) {
        report("Warning: ", exception);
    }

    public void error(SAXParseException exception) {
        report("Error: ", exception);
    }

    public void fatalError(SAXParseException exception) {
        report("Fatal: ", exception);
    }

    /**
     * Shared implementation of the three callbacks.
     *
     * @param label     prefix printed before the parser's message
     * @param exception diagnostic supplied by the parser
     */
    private void report(String label, SAXParseException exception) {
        parseError=true;
        System.out.print(label + exception.getMessage());
        System.out.print(" at line " + exception.getLineNumber()
            + ", column " + exception.getColumnNumber());
        System.out.print (" in request " + numRequests +"\n");
        // getColumnNumber() is -1 when the position is unknown and can
        // exceed the page length, and page may not have been set yet; clamp
        // so that reporting a problem can never itself throw.
        String text = (page == null) ? "" : page;
        int end = exception.getColumnNumber();
        if (end < 0) {
            end = 0;
        } else if (end > text.length()) {
            end = text.length();
        }
        Matcher matcher = ID_PATTERN.matcher(text.substring(0, end));
        if(matcher.find()){
            System.out.print(" id:"+matcher.group(1) +"\n");
        }
    }
}
// ====================================================================================================
// ====================================================================================================
static private int testUrl(String verb_root, String verb_args, String fname, String fname_prefix )
{
numRequests = 1;
doiCount=0;
SAXParserFactory factory;
SAXParser parser;
FullParser fParser = new FullParser();
try
{
factory = SAXParserFactory.newInstance();
factory.setNamespaceAware(true);
parser = factory.newSAXParser();
String URL = "OAIHandler?verb=";
String result = "Error getting URL: "+URL;
String pattStr = "(?ism)^http:\\/\\/([^\\/]+?)(\\/.*)$";
Pattern pattern = Pattern.compile(pattStr);
Matcher matcher = pattern.matcher(URL);
int count = 0;
page = "";
String host = "172.20.1.21";
int port = 8080;
String rToken = "(?ism)(.*?)<\\/resumptionToken>";
Pattern rTokenPatt = Pattern.compile(rToken);
Matcher tokenMatcher;
String errPattStr = "(?ism)(.*?)<\\/resumptionToken>";
Pattern errPatt = Pattern.compile(errPattStr);
Matcher errMatcher;
String errPattStr2 = "(?ism)";
Pattern errPatt2 = Pattern.compile(errPattStr2);
Matcher errMatcher2;
PrintWriter xmlFile;
try
{
HTTPClient.HTTPResponse httpResp = null; // set up response object
HTTPClient.HTTPConnection httpConn = new HTTPClient.HTTPConnection(host,port); // set up connection object
HTTPClient.CookieModule.setCookiePolicyHandler(null); // automatically accept all cookies
String resumptionToken = "start";
while(resumptionToken.length() > 0)
{
if(resumptionToken.equals("start"))
{
resumptionToken = "";
httpResp = httpConn.Get(URL + verb_root + verb_args);
}
else
{
httpResp = httpConn.Get(URL + verb_root + "&resumptionToken="+resumptionToken);
}
resumptionToken=""; // clear the resumption token so we don't ask for it again
if(httpResp != null)
{
int iGetResultCode = httpResp.getStatusCode();
if(iGetResultCode < 300)
{
page = httpResp.getText();
tokenMatcher = rTokenPatt.matcher(page);
errMatcher = errPatt.matcher(page);
errMatcher2 = errPatt2.matcher(page);
if((errMatcher.find()) || (errMatcher2.find()))
{System.out.println("Errors in file: " + numRequests); parseError=true;}
if(tokenMatcher.find())
{
resumptionToken = tokenMatcher.group(1).trim();
}
try // ===== see if it parses
{
byte[] xmlB=httpResp.getData();
ByteArrayInputStream xmlByteIS = new ByteArrayInputStream(xmlB);
InputSource inXML = new InputSource(xmlByteIS);
parseError=false;
parser.parse(xmlByteIS, fParser);
if (parseError) // non fatal errors will go here
{
if(!fname.equals(""))
{ //==== if there is a directory path save the file
File delF = new File(fname+"/oai_"+(numRequests)+"_parse_error.xml");
delF.delete();
xmlFile = new PrintWriter (new OutputStreamWriter (new FileOutputStream (fname+"/" + verb_root + fname_prefix + "_" +(numRequests)+".xml",true),"UTF-8"));
xmlFile.write(page);
xmlFile.close();
}
}
else
{
if(!fname.equals("") && save_files)
{ //==== if there is a directory path save the file
File delF = new File(fname+"/oai_"+(numRequests)+".xml");
delF.delete();
xmlFile = new PrintWriter (new OutputStreamWriter (new FileOutputStream (fname+"/" + verb_root + fname_prefix + "_" +(numRequests)+".xml",true),"UTF-8"));
xmlFile.write(page);
xmlFile.close();
}
}
}
catch (Exception e) // fatal parse errors throw
{
if(!fname.equals(""))
{ //==== if there is a directory path save the file
File delF = new File(fname+"/oai_"+(numRequests)+".xml");
delF.delete();
xmlFile = new PrintWriter (new OutputStreamWriter (new FileOutputStream (fname+"/" + verb_root + "_" +(numRequests)+".xml",true),"UTF-8"));
xmlFile.write(page);
xmlFile.close();
}
}
}
else
System.out.println("HTTP Error :" + httpResp);
}
numRequests++;
}
} catch (Exception e) {
System.out.println(e.toString());
System.err.println(e.getMessage());
String trace = "";
try{
StackTraceElement [] ems = e.getStackTrace();
for(int x=0; x < ems.length; x++){
trace += ems[x].toString()+"
\n";
}
} catch(Exception e2){};
e.printStackTrace();
}
}
catch (Exception e)
{
System.out.println("=== Exception creating parser");
e.printStackTrace();
}
return(numRequests);
}
// =========================================================================
static private ArrayList getSetData(String verb_root, String prefix)
{
ArrayList setInfo = new ArrayList();
String URL = "OAIHandler?verb=";
String host = "oai.crossref.org";
int port = 80;
String rToken = "(?ism)(.*?)<\\/resumptionToken>";
Pattern rTokenPatt = Pattern.compile(rToken);
Matcher tokenMatcher;
try
{
HTTPClient.HTTPResponse httpResp = null; // set up response object
HTTPClient.HTTPConnection httpConn = new HTTPClient.HTTPConnection(host,port); // set up connection object
HTTPClient.CookieModule.setCookiePolicyHandler(null); // automatically accept all cookies
String resumptionToken = "start";
while(resumptionToken.length() > 0)
{
if(resumptionToken.equals("start"))
{
resumptionToken = "";
httpResp = httpConn.Get(URL + verb_root);
}
else
{
httpResp = httpConn.Get(URL + verb_root + "&resumptionToken="+resumptionToken);
}
resumptionToken=""; // clear the resumption token so we don't ask for it again
if(httpResp != null)
{
int iGetResultCode = httpResp.getStatusCode();
if(iGetResultCode < 300)
{
page = httpResp.getText();
tokenMatcher = rTokenPatt.matcher(page);
if(tokenMatcher.find())
{
resumptionToken = tokenMatcher.group(1).trim();
}
//===== now get a list of all the second tier sets for this prefix
String setToken = "(?ism)"+prefix+":(\\d*?)<\\/setSpec>";
Pattern setTokenPatt = Pattern.compile(setToken);
Matcher setMatcher;
setMatcher = setTokenPatt.matcher(page);
while(setMatcher.find())
{
setInfo.add(prefix+":"+setMatcher.group(1));
}
}
else
System.out.println("HTTP Error :" + httpResp);
}
numRequests++;
}
}
catch (Exception e)
{
System.out.println(e.toString());
System.err.println(e.getMessage());
String trace = "";
try
{
StackTraceElement [] ems = e.getStackTrace();
for(int x=0; x < ems.length; x++){
trace += ems[x].toString()+"
\n";
}
} catch(Exception e2){};
e.printStackTrace();
}
return setInfo;
}
// =========================================================================
static public void main(String[] argv) {
if (argv.length != 4) {
System.out.println("Usage: java oaiphmRequest verb verb-args output-fname-root save-files[true|false]");
System.out.println(" This program will make a OAI request and save the output");
System.out.println(" to a set of files in the output directory if save-files=true");
System.out.println(" (parse error files are always saved). Argument verb-args shoud be in quotes:\n");
System.out.println(" Example: java oaipmhRequest ListRecords \"&set=10.1007&metadataPrefix=cr_unixml\" /data_root/oai/10.1007 true\n");
System.out.println(" It also supports a non-standard OAI request called GetSets which");
System.out.println(" first performs a ListSets and then a ListRecords on each top level set.");
System.out.println(" This has the advantage of breaking a single large request into serveral ");
System.out.println(" smaller requests. For this case set verb-args to the prefix to be used.");
return;
}
String verb_root = argv[0];
String verb_args = argv[1];
String fname = argv[2];
String save = argv[3];
ArrayList setS;
if (save != null && save.equals("true")) save_files=true;
if(fname == null)
fname="";
else
System.out.println("Saving data to: " + fname);
if (verb_root.equals("GetSets"))
{
//=== force verb_root to be ListSets, and verb_arg will contain the prefix of the data we're after
setS = getSetData("ListSets", verb_args);
System.out.println("Number of sets for " + verb_args + "=" + setS.size());
for (int i=0;i