| 500 | | def do_src_suff(html, _dir): # change src urls, download and put them in zip |
|---|
| 501 | | l = sre.split('<img\s+src="(.*?)"', html) |
|---|
| 502 | | |
|---|
| 503 | | from pprint import pprint |
|---|
| 504 | | imgs = {} |
|---|
| 505 | | newnames = {} |
|---|
| 506 | | img_urls = l[1::2] |
|---|
| 507 | | for url in img_urls: |
|---|
| 508 | | if not imgs.has_key(url): |
|---|
| 509 | | xurl = url |
|---|
| 510 | | if not url.startswith('http://'): |
|---|
| 511 | | xurl = self.absolute_url() + '/' + url |
|---|
| 512 | | print xurl |
|---|
| 513 | | try: |
|---|
| 514 | | img = urlopen(xurl) |
|---|
| 515 | | media_type, subtype = img.headers.getheader('Content-Type').split('/') |
|---|
| 516 | | if media_type == 'image': |
|---|
| 517 | | tmp_url = img.url |
|---|
| 518 | | if img.url.startswith(self.portal_url()): |
|---|
| 519 | | for s in ('/coverImage', '/image_large'): |
|---|
| 520 | | i = img.url.find(s) |
|---|
| 521 | | if i != -1: |
|---|
| 522 | | tmp_url = list(img.url) |
|---|
| 523 | | tmp_url[i] = '_' |
|---|
| 524 | | tmp_url = ''.join(tmp_url[:i + len(s)]) |
|---|
| 525 | | break |
|---|
| 526 | | basename = tmp_url.split('/')[-1].rsplit('.', 1)[0] |
|---|
| 527 | | filename = '%s.%s' % (basename, subtype) |
|---|
| 528 | | newnames[filename] = newnames.get(filename, 0) + 1 |
|---|
| 529 | | if newnames[filename] > 1: |
|---|
| 530 | | filename = '%s-%d.%s' % (basename, newnames[filename] - 1, subtype) |
|---|
| 531 | | imgs[url] = (filename, img.read()) |
|---|
| 532 | | except HTTPError, e: |
|---|
| 533 | | print "Failed to download %s: %s" % (xurl, e) |
|---|
| 534 | | |
|---|
| 535 | | pprint([(x, y) for x, (y, z) in imgs.iteritems()]) |
|---|
| 536 | | |
|---|
| 537 | | if _dir[-1] == '/': |
|---|
| 538 | | _dir = _dir[:-1] |
|---|
| 539 | | for oldname, (newname, content) in imgs.iteritems(): |
|---|
| | 500 | def do_stuff(html, _dir): |
|---|
| | 501 | def get_absolute_url(url): |
|---|
| | 502 | print '//// relative url:', url |
|---|
| | 503 | try: |
|---|
| | 504 | xurl = base + '/' + url |
|---|
| | 505 | src = urlopen(xurl) |
|---|
| | 506 | except HTTPError: |
|---|
| | 507 | print '----', xurl |
|---|
| | 508 | xurl = self.absolute_url() + '/' + url |
|---|
| | 509 | print '||||', xurl |
|---|
| | 510 | src = urlopen(xurl) |
|---|
| | 511 | return xurl, src |
|---|
| | 512 | |
|---|
| | 513 | def _do_src_suff(html): # change src urls, download and put them in zip |
|---|
| | 514 | l = sre.split('(<[^>]+)src="(.*?)"', html) |
|---|
| | 515 | |
|---|
| | 516 | srcs = {} |
|---|
| | 517 | newnames = {} |
|---|
| | 518 | src_urls = l[2::3] |
|---|
| | 519 | for url in src_urls: |
|---|
| | 520 | if not srcs.has_key(url): |
|---|
| | 521 | xurl = url |
|---|
| | 522 | try: |
|---|
| | 523 | if not url.startswith('http://'): |
|---|
| | 524 | xurl, src = get_absolute_url(url) |
|---|
| | 525 | else: |
|---|
| | 526 | src = urlopen(xurl) |
|---|
| | 527 | print xurl |
|---|
| | 528 | media_type, subtype = src.headers.getheader('Content-Type').split('/') |
|---|
| | 529 | if media_type == 'image': |
|---|
| | 530 | tmp_url = src.url |
|---|
| | 531 | if src.url.startswith(self.portal_url()): |
|---|
| | 532 | for s in ('/coverImage', '/image_large'): |
|---|
| | 533 | i = src.url.find(s) |
|---|
| | 534 | if i != -1: |
|---|
| | 535 | tmp_url = list(src.url) |
|---|
| | 536 | tmp_url[i] = '_' |
|---|
| | 537 | tmp_url = ''.join(tmp_url[:i + len(s)]) |
|---|
| | 538 | break |
|---|
| | 539 | basename = tmp_url.split('/')[-1].rsplit('.', 1)[0] |
|---|
| | 540 | filename = '%s.%s' % (basename, subtype.split(';', 1)[0]) |
|---|
| | 541 | newnames[filename] = newnames.get(filename, 0) + 1 |
|---|
| | 542 | if newnames[filename] > 1: |
|---|
| | 543 | filename = '%s-%d.%s' % (basename, newnames[filename] - 1, subtype) |
|---|
| | 544 | else: |
|---|
| | 545 | filename = src.url.split('/')[-1] |
|---|
| | 546 | srcs[url] = (filename, src.read()) |
|---|
| | 547 | except HTTPError, e: |
|---|
| | 548 | print "Failed to download %s: %s" % (xurl, e) |
|---|
| | 549 | |
|---|
| | 550 | s = ['%s%ssrc="%s"' % (stuff, tag, srcs.get(link, (link,))[0]) for stuff, tag, link in zip(l[::3], l[1::3], l[2::3])] |
|---|
| | 551 | s.append(l[-1]) |
|---|
| | 552 | return ''.join(s), srcs |
|---|
| | 553 | |
|---|
| | 554 | def _get_refs_and_rewrite(html, regex, start_idx, step): |
|---|
| | 555 | srcs = {} |
|---|
| | 556 | l = sre.split(regex, html) |
|---|
| | 557 | for url in l[start_idx::step]: |
|---|
| | 558 | if not srcs.has_key(url): |
|---|
| | 559 | xurl = url |
|---|
| | 560 | try: |
|---|
| | 561 | if not url.startswith('http://'): |
|---|
| | 562 | xurl, src = get_absolute_url(url) |
|---|
| | 563 | else: |
|---|
| | 564 | src = urlopen(xurl) |
|---|
| | 565 | print xurl |
|---|
| | 566 | filename = src.url.split('/')[-1] |
|---|
| | 567 | srcs[url] = (filename, src.read()) |
|---|
| | 568 | except HTTPError, e: |
|---|
| | 569 | print "Failed to download %s: %s" % (xurl, e) |
|---|
| | 570 | return srcs, l |
|---|
| | 571 | |
|---|
def _do_css_link_stuff(html):
    """Download stylesheets referenced by ``<link ... href="*.css">`` tags.

    Returns ``(new_html, srcs)``: the markup with each downloaded href
    replaced by its stored filename, and the url -> (filename, content)
    mapping produced by ``_get_refs_and_rewrite``.
    """
    fetched, pieces = _get_refs_and_rewrite(html, '(<link[^>]+)href="(.*?\.css)"', 2, 3)

    # pieces is [text, tag_prefix, href, text, tag_prefix, href, ..., tail]
    rebuilt = []
    for before, tag, link in zip(pieces[::3], pieces[1::3], pieces[2::3]):
        target = fetched.get(link, (link,))[0]  # unchanged if not downloaded
        rebuilt.append('%s%shref="%s"' % (before, tag, target))
    rebuilt.append(pieces[-1])
    return ''.join(rebuilt), fetched
|---|
| | 578 | |
|---|
def _do_css_import_stuff(html):
    """Download stylesheets pulled in via ``@import url(...)`` in ``<style>`` tags.

    The url may be bare or wrapped in single/double quotes; the quotes are
    not part of the captured url (same convention as ``_do_css_stuff``), so
    quoted imports are fetched correctly. No regex flags are passed, so
    ``.`` does not cross newlines: the ``@import`` has to sit on the same
    line as the opening ``<style ...>`` tag to be matched.

    Returns ``(new_html, srcs)``: the markup with each downloaded import
    rewritten to ``url(<filename>)``, and the url -> (filename, content)
    mapping produced by ``_get_refs_and_rewrite``.
    """
    srcs, l = _get_refs_and_rewrite(
        html, '(<style[^>]+>.*?@import url)\([\'"]?(.*?)[\'"]?\)', 2, 3)

    # l is [text, style_prefix, url, text, style_prefix, url, ..., tail]
    s = ['%s%s(%s)' % (stuff, tag, srcs.get(link, (link,))[0])
         for stuff, tag, link in zip(l[::3], l[1::3], l[2::3])]
    s.append(l[-1])
    return ''.join(s), srcs
|---|
| | 585 | |
|---|
def _do_css_stuff(css):
    """Download every ``url(...)`` reference inside a stylesheet.

    Optional single or double quotes around the url are dropped. Returns
    ``(new_css, srcs)``: the stylesheet with each downloaded reference
    rewritten to ``url(<filename>)``, and the url -> (filename, content)
    mapping produced by ``_get_refs_and_rewrite``.
    """
    fetched, pieces = _get_refs_and_rewrite(css, 'url\([\'"]?(.*?)[\'"]?\)', 1, 2)

    # pieces alternates [text, url, text, url, ..., tail]
    rebuilt = []
    for before, link in zip(pieces[::2], pieces[1::2]):
        target = fetched.get(link, (link,))[0]  # unchanged if not downloaded
        rebuilt.append('%surl(%s)' % (before, target))
    rebuilt.append(pieces[-1])
    return ''.join(rebuilt), fetched
|---|
| | 592 | |
|---|
| | 593 | def _get_and_remove_base(html): |
|---|
| | 594 | l = sre.split('(<base href=")(.*?)(".*?/>)', html) |
|---|
| | 595 | base = l[2] |
|---|
| | 596 | del l[1:4] |
|---|
| | 597 | return base, ''.join(l) |
|---|
| | 598 | |
|---|
| | 599 | # FIXME: check whether entries in srcs can collide... |
|---|
| | 600 | # FIXME: <base> tags have to be stripped out, |
|---|
| | 601 | # or rather the resources should be collected based on them |
|---|
| | 602 | base, html = _get_and_remove_base(html) |
|---|
| | 603 | print '++++', base |
|---|
| | 604 | html, srcs = _do_src_suff(html) |
|---|
| | 605 | html, new_srcs = _do_css_link_stuff(html) |
|---|
| | 606 | srcs.update(new_srcs) |
|---|
| | 607 | html, new_srcs = _do_css_import_stuff(html) |
|---|
| | 608 | srcs.update(new_srcs) |
|---|
| | 609 | for oldurl, (newurl, css) in srcs.items(): |
|---|
| | 610 | if newurl.endswith('.css'): |
|---|
| | 611 | new_css, new_srcs = _do_css_stuff(css) |
|---|
| | 612 | srcs.update(new_srcs) |
|---|
| | 613 | srcs[oldurl] = (newurl, new_css) |
|---|
| | 614 | |
|---|
| | 615 | print '\\\\\\\\ Adding content to zip' |
|---|
| | 616 | for oldname, (newname, content) in srcs.iteritems(): |
|---|