#!/usr/bin/env python
# Written for Python 2.5 -- get hathihelper30.py if you have python 2.6 or 3.0.
# This revision only uses constructs shared by Python 2.5+ and Python 3.
"""Check metadata of, and download, volumes held by the HathiTrust.

The HathiTrust, http://www.hathitrust.org/, provides access to hundreds of
thousands of public domain books that have been digitized by Google/University
partnership or smaller independent scanning efforts. Much of the content is
also available on Google Books, but the operators of the HathiTrust are more
aggressive in clearing material for the public domain. This means you can
access the full text/page images for some works from HathiTrust when the
corresponding Google Books entry does not offer useful access.

The HathiTrust does not yet offer an easy way to download entire volumes. The
current book viewer application offers downloads of 10 page PDF chunks, at
best, and these chunks do not appear to offer full image resolution.

With hathihelper.py you should be able to easily check the metadata of a
volume in the HathiTrust system, and download all of its page images and OCR
text if the volume is in the public domain. This program has not been tested
outside the United States; some material that is freely available to US users
may be restricted from download elsewhere due to differing national copyright
laws.

The HathiTrust APIs are not finalized so this program could break if the APIs
are changed. Check back at http://www.sciencemadness.org/library/hathi/ for
more information and any future updates.
"""

import heapq
import logging
import os
import pprint
import random
import subprocess
import sys
import time
import xml.etree.ElementTree as ET
from optparse import OptionParser

try:
    import urllib2
except ImportError:
    # Python 3 moved urlopen() into urllib.request; keep the old local name so
    # the rest of the module reads the same on either interpreter.
    import urllib.request as urllib2

__version__ = "0.1"
__license__ = "GPL v2"


def _as_text(data):
    """Return subprocess output as a native string (bytes are decoded)."""
    if data is None:
        return ''
    if isinstance(data, str):
        return data
    return data.decode('utf-8', 'replace')


def _say(text):
    """Write one line to stdout (stand-in for the Python 2 print statement)."""
    sys.stdout.write('%s\n' % text)


def _detect_identify():
    """Return True if ImageMagick's 'identify' program can be run."""
    try:
        identify_proc = subprocess.Popen(['identify', '-help'],
                                         stdout=subprocess.PIPE,
                                         stderr=subprocess.PIPE)
        out, err = identify_proc.communicate()
    except OSError:
        # Popen raises OSError when the executable is missing. WindowsError is
        # a subclass of OSError and is not even defined on other platforms, so
        # catching OSError covers every host.
        return False
    return 'Magick' in (_as_text(out) + _as_text(err))


# If ImageMagick is installed, use 'identify' to perform more robust
# validation on image content.
identify = _detect_identify()

LOGFILE = 'hathi.log'
logging.basicConfig(filename=LOGFILE, level=logging.DEBUG)


def download(url, filename, validator=None, overwrite=False):
    """Download resource identified by url and store it in filename.

    If validator is not None, it is called as validator(filename, status)
    after the transfer attempt and its result becomes status['validated'].
    If overwrite is True, content that is already stored to disk is
    downloaded again; otherwise an existing file is left untouched.

    Returns the status dict describing the attempt. Transfer errors are never
    raised; they are recorded in status['error'].
    """
    # Simple downloader based on urllib2; more robust versions possible
    # with PycURL or wrapped wget.
    status = {'code': None, 'bytes_stored': 0, 'content-length': None,
              'content-type': None, 'validated': False, 'error': None,
              'filename': filename, 'retries': 0, 'elapsed': 0.0}

    # Once the file is saved to disk, validated=True until/unless the
    # validator method overrides it.
    if overwrite or not os.path.exists(filename):
        try:
            url_handle = urllib2.urlopen(url)
            try:
                headers = url_handle.headers
                status['content-type'] = headers.get('content-type')
                status['content-length'] = headers.get('content-length')
                status['code'] = url_handle.code
                data = url_handle.read()
            finally:
                url_handle.close()
            out = open(filename, 'wb')
            try:
                out.write(data)
            finally:
                out.close()
            status['bytes_stored'] = len(data)
            status['validated'] = bool(data)
            if not data:
                status['error'] = 'No data downloaded.'
        except Exception:
            # Deliberately broad: any failure is reported through the status
            # dict so the caller can decide whether to retry.
            e = sys.exc_info()[1]
            status['error'] = '%s %s' % (e.__class__.__name__, e)
    else:
        status['bytes_stored'] = os.path.getsize(filename)
        status['validated'] = True

    if validator:
        status['validated'] = validator(filename, status)

    return status


class HDocument(object):
    """HDocument represents a HathiTrust volume as it is described by
    its XML metadata file"""

    def __init__(self, identifier, metadata_file, metadata_url=None):
        """Parse metadata_file and remember the volume identifier.

        metadata_url is kept only for reference; it is never fetched here.
        """
        self.amap = {
            'restricted': 'authentication and authorization required',
            'limited': 'access to public domain items is open but may be '
                       'rate-limited',
            'open': 'no restrictions',
            None: 'information missing'}
        self.identifier = identifier
        self.metadata_url = metadata_url
        self.urls_to_files = {}
        # Pre-populate the sections read elsewhere so that sparse metadata
        # yields empty results instead of a KeyError.
        self.docattrs = {'resources': {}, 'rights': {}, 'seqmap': []}
        self.parse_metadata(metadata_file)

    def unprefix_tag(self, tag):
        """Removes the distracting namespace prefix used by ElementTree
        so tags are easier to read"""
        return tag.rsplit('}', 1)[-1]

    def tagget_etree(self, etree):
        """Skim off the top level of etree tag/text relations and put
        them in a dict as key/value entries"""
        D = {}
        for e in etree:
            D[self.unprefix_tag(e.tag)] = e.text
        return D

    def parse_metadata(self, mfile):
        """Extract all the parts of the metadata that are likely useful
        to humans or further data retrieval"""
        tree = ET.ElementTree()
        tree.parse(mfile)
        for e in tree.getroot():
            if e.tag.endswith('access'):
                self.set_resource_access(e)
            elif e.tag.endswith('version'):
                self.docattrs['version'] = e.text
            elif e.tag.endswith('rights'):
                self.docattrs['rights'] = self.tagget_etree(e)
            elif e.tag.endswith('seqmap'):
                self.set_seqmap(e)

    def set_seqmap(self, e):
        """Store the sequence number, page name, page image content type
        information found for each page image in the metadata seqmap"""
        seqmap = []
        for seq in e:
            attrs = self.tagget_etree(seq)
            attrs.update(seq.attrib)
            seqmap.append(attrs)
        self.docattrs['seqmap'] = seqmap

    def set_resource_access(self, e):
        """Store the access status information for page images, page OCR,
        and aggregate data"""
        key = e.get('resource')
        # The access level is the fragment after '#'. An element without text
        # is recorded as None, which amap describes as 'information missing'.
        value = None
        if e.text:
            value = e.text.split('#')[-1]
        self.docattrs.setdefault('resources', {})[key] = value

    def _describe_access(self, resource):
        """Return (level, explanation) for one resource name."""
        level = self.docattrs.get('resources', {}).get(resource)
        return level, self.amap.get(level, 'unrecognized access level')

    def describe_textual(self):
        """Show the parts of the metadata that are likely to interest
        humans"""
        # TODO: add describe_html to build a browseable index page for volume
        _say("\n")
        _say("Identifier: %s" % self.identifier)
        _say("Number of pages: %i" % len(self.docattrs.get('seqmap', [])))
        _say("Page OCR text access: %s -- %s"
             % self._describe_access('pageocr'))
        _say("Page image access: %s -- %s"
             % self._describe_access('pageimage'))
        _say("Aggregate data access: %s -- %s"
             % self._describe_access('aggregate'))
        _say("Rights: these attributes are not all documented by HathiTrust:")
        pprint.pprint(self.docattrs.get('rights', {}))


class HFetch(object):
    """HFetch retrieves document information about a volume and handles
    downloads of page images and OCR text when requested"""

    def __init__(self, volume, refresh=False, download_retries=3,
                 download_wait=30, verbose=True, imgdir='images',
                 textdir='texts'):
        """Fetch the metadata for volume and prepare for page downloads.

        download_wait is the base delay in seconds between retries; it is
        doubled after every failed attempt. Exits the process if the
        metadata cannot be retrieved.
        """
        self.imgdir = volume + '/' + imgdir
        self.textdir = volume + '/' + textdir
        self.verbose = verbose
        self.logger = logging.getLogger("HFetch")
        self.volume = volume
        self.refresh = refresh
        self.download_retries = download_retries
        self.download_wait = download_wait
        self.image_url_template = "http://services.hathitrust.org/api/htd/pageimage/%(volume)s/%(seqnum)i"
        self.text_url_template = "http://services.hathitrust.org/api/htd/pageocr/%(volume)s/%(seqnum)i"
        self.downloads = {}
        self.download_metadata(volume)
        self.log('info', 'init finished for volume %s' % volume)

    def log(self, level, msg):
        """Store a message with a specified logging level to the activity
        log and display it in the terminal if verbose"""
        if self.verbose:
            _say('%s %s: %s' % (self.logger.name, level, msg))
        # Logger.warn is a deprecated alias; route it to Logger.warning while
        # keeping the short label in the terminal output.
        method_name = level
        if level == 'warn':
            method_name = 'warning'
        getattr(self.logger, method_name)(msg)

    def show_download_stats(self):
        """Log some mildly interesting statistics about cumulative
        download data transfer, time, and retries"""
        elapsed = 0.0
        retries = 0
        stored = 0
        for entry in self.downloads.values():
            retries += entry['retries']
            stored += entry['bytes_stored']
            elapsed += entry['elapsed']

        self.log('info', "Finished downloading %s" % self.volume)
        self.log('info', "Bytes stored: %i" % stored)
        self.log('info', "Time elapsed (seconds): %.1f" % elapsed)
        self.log('info', "Download retries: %i" % retries)

    def download_metadata(self, volume):
        """Retrieve the XML metadata for a volume from the HathiTrust API"""
        template = "http://services.hathitrust.org/api/htd/meta/%s"
        url = template % volume
        filename = volume + '.xml'
        # A zero-byte file is the residue of a failed earlier run; remove it
        # so the metadata is fetched again.
        if os.path.exists(filename) and os.path.getsize(filename) == 0:
            os.unlink(filename)

        def validator(filename, status):
            ok = not status['error']
            if not ok:
                _say("Problem downloading metadata: %s" % status['error'])
            return ok

        validated = self.download(url, filename, validator=validator)
        if not validated:
            self.log('error', 'Unable to get metadata for %s' % volume)
            sys.exit(1)

        self.document = HDocument(volume, filename, metadata_url=url)

    def download(self, url, filename, validator=None):
        """Download a file to disk and optionally validate it.

        The status of the attempt is stored in self.downloads[url]; the
        return value is that status's 'validated' flag.
        """
        t0 = time.time()
        status = download(url, filename, validator=validator,
                          overwrite=self.refresh)
        status['elapsed'] = time.time() - t0
        self.downloads[url] = status
        return status['validated']

    def make_text_validator(self, content_type):
        """Return a validator that approves any OCR text that reached disk.

        Empty pages are accepted, but an attempt that left no file behind
        (for example a failed connection) is rejected so it can be retried.
        """
        def validator(filename, status):
            return os.path.exists(filename)

        return validator

    def make_image_validator(self, content_type):
        """Build and return a validation method for images based on
        the expected content type and availability of ImageMagick"""
        # ImageMagick's JPEG2000 support is built on JasPer, which does
        # not seem to properly interpret all JP2 files from HT. Verify
        # only TIFF and standard JPEG with 'identify'.
        checkable = ('image/tiff', 'image/jpg', 'image/jpeg')
        if identify and content_type in checkable:
            def validator(filename, status):
                identify_proc = subprocess.Popen(['identify', filename],
                                                 stdout=subprocess.PIPE,
                                                 stderr=subprocess.PIPE)
                # a successful run should not produce any stderr messages
                # other than perhaps a warning about TIFF tags
                (out, err) = identify_proc.communicate()
                err = _as_text(err)
                if err and 'unknown field with tag' not in err:
                    _say("Image verification error: %s" % err)
                    return False
                return True
        else:
            def validator(filename, status):
                return not status['error']

        return validator

    def make_content_name(self, content_type, pseq, pname, storagedir,
                          extension=None):
        """Generate name for a file based on content-type, sequence number,
        page name, and storage location. File extension can also be
        supplied explicitly instead of derived from content_type."""
        if extension:
            ext = extension
        else:
            ext = content_type.split('/')[-1]
        pseq = str(pseq).zfill(4)
        pnum = str(pname).zfill(4)
        return '%s/%s_%s.%s' % (storagedir, pseq, pnum, ext)

    def download_images(self):
        """Retrieve and verify all the page images defined in the volume"""
        self.log('info', "Beginning download of %i page images"
                 % len(self.document.docattrs['seqmap']))
        self.download_pages(self.imgdir, self.image_url_template,
                            self.make_image_validator, 'pageimage')

    def download_texts(self):
        """Retrieve and verify all the OCR text for each page defined
        in the volume"""
        self.log('info', "Beginning download of OCR text from %i pages"
                 % len(self.document.docattrs['seqmap']))
        self.download_pages(self.textdir, self.text_url_template,
                            self.make_text_validator, 'pageocr',
                            extension='txt')

    def download_pages(self, storagedir, url_template, validator_creator,
                       datatype, extension=None):
        """Retrieve and verify all the page images or OCR texts defined
        in the volume.

        datatype is the resource name ('pageimage' or 'pageocr') used to
        look up the access level reported by the metadata. Exits the
        process if storagedir cannot be created.
        """
        if not os.path.exists(storagedir):
            try:
                os.makedirs(storagedir)
            except OSError:
                e = sys.exc_info()[1]
                sys.stderr.write("Unable to create directory %s: %s %s\n"
                                 % (storagedir, e.__class__.__name__, e))
                sys.exit(1)

        targets = []
        for seq in self.document.docattrs['seqmap']:
            filename = self.make_content_name(seq['imgfmt'], seq['pseq'],
                                              seq['pnum'], storagedir,
                                              extension=extension)
            filenum = int(seq['pseq'])
            url = url_template % {'seqnum': filenum, 'volume': self.volume}
            validator = validator_creator(seq['imgfmt'])
            targets.append((url, filename, validator))

        # Access levels are stored per resource under 'resources' (see
        # HDocument.set_resource_access), not under 'rights'.
        resources = self.document.docattrs.get('resources', {})
        if resources.get(datatype) == 'restricted':
            self.log('warn', 'Page access is restricted. Downloads will probably fail. Continuing anyway.')

        self.queue_download(targets)

    def queue_download(self, targets, randomize=False):
        """Download each target (url, filename, validator) tuple in targets
        via priority queue. Failed downloads reenter the queue with lower
        priority (higher number) until self.download_retries attempts have
        passed. If randomize=True then files will be shuffled before
        download."""
        download_targets = list(targets)
        if randomize:
            random.shuffle(download_targets)

        # Heap entries are (retries, order, url, filename, validator). The
        # unique 'order' value keeps entries with equal retry counts in
        # insertion order (so shuffling is honoured) and guarantees the
        # comparison never reaches the validator callables.
        queue = []
        order = 0
        for url, filename, validator in download_targets:
            heapq.heappush(queue, (0, order, url, filename, validator))
            order += 1

        while queue:
            retries, _, url, filename, validator = heapq.heappop(queue)
            self.log('info', 'Attempting to retrieve %s' % filename)
            success = self.download(url, filename, validator=validator)
            # self.download() stores a fresh status each time, so the retry
            # count has to be written back after every attempt.
            self.downloads[url]['retries'] = retries
            if success:
                continue

            error = self.downloads[url]['error']
            self.log('warn', "Couldn't verify %s: %s" % (filename, error))
            if retries + 1 > self.download_retries:
                # Nothing left to wait for, so give up without sleeping.
                self.log('error', 'Maximum number of download attempts passed for %s -- %s. Giving up.' % (filename, url))
                continue

            delay = self.download_wait * 2 ** retries
            self.log('warn', 'Attempt %i. Delaying %i seconds.'
                     % (retries, delay))
            time.sleep(delay)
            # Remove the rejected file; otherwise the next attempt would see
            # it on disk and skip the transfer.
            if os.path.exists(filename):
                os.unlink(filename)
            heapq.heappush(queue,
                           (retries + 1, order, url, filename, validator))
            order += 1


def main():
    """Parse the command line, show volume metadata and download pages."""
    parser = OptionParser()
    parser.add_option('-i', '--identifier', action='store', type='string',
                      dest='identifier', default=None,
                      help='Volume identifier (example: miun.abr0732.0001.001). This parameter is required.')
    parser.add_option('-r', '--refresh', action='store_true', dest='refresh',
                      default=False,
                      help='Download files again even if they are already saved to disk.')
    parser.add_option('-m', '--metadata-only', action='store_true',
                      dest='metadata_only', default=False,
                      help='Only get metadata. Do not retrieve page images or OCR text.')
    (options, parsedargs) = parser.parse_args()
    if not options.identifier:
        sys.stderr.write("You must supply a volume identifier. Run %s -h for help.\n" % sys.argv[0])
        sys.exit(1)

    fetcher = HFetch(options.identifier, refresh=options.refresh)
    fetcher.document.describe_textual()
    if not options.metadata_only:
        fetcher.download_images()
        fetcher.download_texts()
        fetcher.show_download_stats()


if __name__ == '__main__':
    main()