Changeset 1916
- Timestamp:
- 08/03/07 14:06:56 (5 years ago)
- Files:
-
- trunk/Collection.py (modified) (15 diffs)
Legend:
- Unmodified
- Added
- Removed
- Modified
- Copied
- Moved
trunk/Collection.py
r1911 r1916 499 499 500 500 #Zip / SCORM download stuff begin -------------------------------------------------------------------------------- 501 def _splitHTML(self, htmlPage ):501 def _splitHTML(self, htmlPage, pageURL): 502 502 503 503 def flatten(seq): … … 526 526 elif previousInfo == None: 527 527 previousInfo = '' 528 infoList.insert(i +1, None)529 inputList.insert(i +1, text)528 infoList.insert(i + 1, None) 529 inputList.insert(i + 1, text) 530 530 if previousInfo == None: 531 531 infoList.insert(0, None) … … 552 552 return inputList, infoList 553 553 554 splitHtml, infoList = reqursiveRegularSplit(htmlPage, None, 'base', '<base href="(.*?)".*?/>', 2, 4)554 splitHtml, infoList = reqursiveRegularSplit(htmlPage, None, 'base', '<base href="(.*?)".*?/>', 1, 2) 555 555 if len(splitHtml) > 1: 556 556 baseURL = splitHtml[-2] … … 558 558 del infoList[1::2] 559 559 else: 560 baseURL = self.portal_url()560 baseURL = pageURL 561 561 if not baseURL.endswith('/'): 562 562 baseURL += '/' 563 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, None, r'<a[^>]*?>(\s*<img.*?>\s*)</a>', 1, 2) 563 564 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'src', '(<link[^>]+href=")(.*?)("[^>]+rel="stylesheet")', 2, 4) 565 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'href', '(<a[^>]+)href="(.*?)"', 2, 3) 564 566 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'fvars', '(<embed[^>]*?flashvars=")(.*?)(")', 2, 4) 565 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'XMLmp3', '(<voiceover[^>]+ )(src=")(.*?)(")', 3, 5)566 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'src', '(<[^>]+ )(src=")(.*?)(")', 3, 5)567 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'XMLmp3', '(<voiceover[^>]+src=")(.*?)(")', 2, 4) 568 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'src', '(<[^>]+src=")(.*?)(")', 2, 4) 567 569 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'src', '(<param.+?name="movie".*?value=")(.*?)(")', 2, 4) 568 570 splitHtml, infoList = reqursiveRegularSplit(splitHtml, infoList, 'fvars', r"(<script.*?>.*?AC_FL_RunContent\(.*?'flashvars', ')(.*?)((?<!\\)'.*?</script>)", 2, 4) … … 572 574 return flatten(splitHtml), flatten(infoList), baseURL 573 575 574 def _getAbsoluteURL(self, url, info, baseURL= None):576 def _getAbsoluteURL(self, url, info, baseURL=''): 575 577 flash = None 576 if info in ('src', 'jsfname', 'XMLmp3'): 577 if url.startswith('http://') or baseURL == None: 578 url2 = url 579 else: 580 url2 = baseURL + url 578 if info in ('src', 'jsfname', 'XMLmp3', 'href'): 579 if not url.startswith('http://'): 580 url = baseURL + url 581 581 if info == 'jsfname': 582 url 2+= '.swf'582 url += '.swf' 583 583 elif info == 'fvars': 584 584 if url.startswith('file='): 585 url 2= url[5:-5]585 url = url[5:-5] 586 586 flash = 'mp3' 587 587 elif url.startswith('xml='): 588 url 2= url[4:]588 url = url[4:] 589 589 flash = 'pilot' 590 590 elif url.startswith("config={videoFile: '"): 591 url 2= url[20:-2]591 url = url[20:-2] 592 592 flash = 'flv1' 593 593 elif url.startswith("config={videoFile: \\'"): 594 url 2= url[21:-3]594 url = url[21:-3] 595 595 flash = 'flv2' 596 return url2, flash 597 598 def _getFileNameOfURL(self, url): 599 return sre.split(r'[/\\]', url.split('?')[0])[-1] 596 url = url.replace('\\', '/') 597 return url, flash 598 599 def _splitFileName(self, fileName): 600 splitted = fileName.split('.') 601 if len(splitted) > 1: 602 return '.'.join(splitted[:-1]), '.' + splitted[-1] 603 return fileName, '' 600 604 601 605 def _hasFileNameInSrcs(self, fileName, srcs): … … 603 607 src = srcs[absURL] 604 608 if src.has_key('fileName') and src['fileName'] == fileName: 605 return True, src 606 return False, None 607 608 def _splitFileName(self, fileName): 609 splitted = fileName.split('.') 610 if len(splitted) > 1: 611 return '.'.join(splitted[:-1]), splitted[-1] 612 return fileName, '' 609 return src 610 return None 613 611 614 612 def _findUniqueFileName(self, absURL, srcs): 615 fileName = self._getFileNameOfURL(absURL) 616 hasFileName, src2 = self._hasFileNameInSrcs(fileName, srcs) 617 if hasFileName: 613 fileName = absURL.split('?')[0].split('/')[-1] 614 src2 = self._hasFileNameInSrcs(fileName, srcs) 615 if src2 == None: 616 return fileName 617 else: 618 618 src2['counter'] += 1 619 619 fileName, extension = self._splitFileName(fileName) 620 620 return self._findUniqueFileName('%s(%d)%s' % (fileName, src2['counter'], extension), srcs) 621 else: 622 return fileName 623 624 def _processHtml(self, splitHtml, infoList, srcs, baseURL, htmlIndex): 621 622 def _processHtml(self, splitHtml, infoList, srcs, hrefs, baseURL, htmlIndex): 625 623 for i in range(len(infoList)): 626 if infoList[i] in ('src', 'jsfname', 'fvars', 'XMLmp3' ):624 if infoList[i] in ('src', 'jsfname', 'fvars', 'XMLmp3', 'href'): 627 625 absURL, flash = self._getAbsoluteURL(splitHtml[i], infoList[i], baseURL) 628 if not srcs.has_key(absURL): 629 srcs[absURL] = {} 630 src = srcs[absURL] 631 src['fileName'] = self._findUniqueFileName(absURL, srcs) 632 src['finalURL'] = '' 633 src['usedBy'] = set([htmlIndex]) 634 src['counter'] = 0 635 if flash == 'mp3': 636 src['extension'] = 'mp3' 637 elif flash == 'pilot': 638 src['extension'] = 'xml' 639 elif flash in ('flv1', 'flv2'): 640 src['extension'] = 'flv' 641 elif infoList[i] == 'XMLmp3': 642 src['extension'] = 'mp3' 626 if infoList[i] != 'href' or absURL.find('at_download') != -1: 627 if srcs.has_key(absURL): 628 srcs[absURL]['usedBy'].add(htmlIndex) 643 629 else: 644 src['extension'] = '' 630 srcs[absURL] = {} 631 src = srcs[absURL] 632 src['fileName'] = self._findUniqueFileName(absURL, srcs) 633 src['finalURL'] = '' 634 src['usedBy'] = set([htmlIndex]) 635 src['counter'] = 0 636 if flash == 'mp3': 637 src['extension'] = 'mp3' 638 elif flash == 'pilot': 639 src['extension'] = 'xml' 640 elif flash in ('flv1', 'flv2'): 641 src['extension'] = 'flv' 642 elif infoList[i] == 'XMLmp3': 643 src['extension'] = 'mp3' 644 else: 645 src['extension'] = '' 646 if infoList[i] == 'href': 647 absURL, flash = self._getAbsoluteURL(splitHtml[i], infoList[i], baseURL) 648 if hrefs.has_key(splitHtml[i]): 649 hrefs[absURL]['usedBy'].add(htmlIndex) 645 650 else: 646 srcs[absURL]['usedBy'].add(htmlIndex) 651 hrefs[absURL] = {} 652 href = hrefs[absURL] 653 href['finalURL'] = '' 654 href['type'] = 0 655 href['usedBy'] = set([htmlIndex]) 647 656 648 657 def _processPilotXMLs(self, srcs, baseURLs, baseDirs): … … 653 662 src = srcs[absURL] 654 663 file = urlopen(absURL) 655 #file = urlopen('http://localhost:81/buildXML.xml')656 664 xml = file.read().decode('utf-16') 657 665 file.close() 658 666 xmlDatas[absURL]={} 659 667 xmlData = xmlDatas[absURL] 660 xmlData['split'], xmlData['info'], baseURL = self._splitHTML(xml )668 xmlData['split'], xmlData['info'], baseURL = self._splitHTML(xml, '') 661 669 xmlData['fileName'] = src['fileName'] 662 670 xmlData['baseDirs'] = [] 663 671 for htmlIndex in src['usedBy']: 664 self._processHtml(xmlData['split'], xmlData['info'], srcs, baseURLs[htmlIndex], htmlIndex)672 self._processHtml(xmlData['split'], xmlData['info'], srcs, {}, baseURLs[htmlIndex], htmlIndex) 665 673 xmlData['baseDirs'].append(baseDirs[htmlIndex]) 666 674 except (HTTPError, URLError): … … 668 676 return xmlDatas 669 677 670 def _addExtension(self, fileName, extension):671 name, ext = self._splitFileName(fileName)672 if ext == '' and extension != '':673 fileName = '%s.%s' % (fileName, extension)674 return fileName675 676 678 def _downloadFiles(self, srcs, zip, baseDirs = []): 679 def _addExtension(fileName, extension): 680 name, ext = self._splitFileName(fileName) 681 if ext == '' and extension != '': 682 fileName = '%s.%s' % (fileName, extension) 683 return fileName 677 684 for absURL in srcs: 678 685 src = srcs[absURL] 679 686 data = '' 687 mediaType = '' 680 688 subType = '' 681 689 try: … … 683 691 file = urlopen(absURL) 684 692 data = file.read() 693 mediaType, subType = file.headers.getheader('Content-Type').split('/') 685 694 file.close() 686 mediaType, subType = file.headers.getheader('Content-Type').split('/')687 695 subType = subType.split(';')[0] 688 696 if mediaType == 'image': 689 if subType == 'jpeg':697 if subType in ('jpeg', 'pjpeg'): 690 698 src['extension'] = 'jpg' 691 elif subType in ('png', 'gif', 'bmp'): 699 elif subType == 'x-ms-bmp': 700 src['extension'] = 'bmp' 701 elif subType in ('png', 'gif'): 692 702 src['extension'] = subType 693 pass 703 elif subType == 'x-png': 704 src['extension'] = 'png' 705 if mediaType == 'application' and subType == 'x-shockwave-flash': 706 src['extension'] = 'swf' 707 if mediaType == 'video': 708 if subType in ('x-msvideo', 'avi'): 709 src['extension'] = 'avi' 710 elif subType == 'x-ms-wmv': 711 src['extension'] = 'wmv' 712 elif subType == 'mpeg': 713 src['extension'] = 'mpg' 714 elif subType == 'mp4': 715 src['extension'] = subType 716 elif subType == 'quicktime': 717 src['extension'] = 'mov' 694 718 except (HTTPError, URLError): 695 719 print "File download error: " + absURL 696 720 697 fileName = self._addExtension(src['fileName'], src['extension'])721 fileName = _addExtension(src['fileName'], src['extension']) 698 722 if len(src['usedBy']) > 1: 699 723 src['finalURL'] = '_SharedFiles/' + fileName 700 724 if src['extension'] != 'xml': 701 725 zip.writestr((src['finalURL']).encode('latin-1'), data) 702 src['finalURL'] = '../../' + src['finalURL']703 726 else: 704 727 src['finalURL'] = '_Files/' + fileName … … 707 730 zip.writestr((baseDirs[i] + src['finalURL']).encode('latin-1'), data) 708 731 709 def _updateHtmls(self, splitHtmls, infoLists, baseURLs, srcs): 732 def _processHrefs(self, hrefs, srcs, htmlURLs, baseDirs, portalURL): 733 downloadURLs = [] 734 for absURL in srcs: 735 fileName = absURL.split('?')[0].split('/') 736 domain = '/'.join(fileName[:-1]) 737 fileName = fileName[-1] 738 downloadURLs.append('%s/at_download/%s' % (domain, fileName)) 739 for absURL in hrefs: 740 href = hrefs[absURL] 741 if absURL in htmlURLs: 742 href['finalURL'] = 'href="%sindex.html"' % baseDirs[htmlURLs.index(absURL)] 743 elif absURL + '/' in htmlURLs: 744 href['finalURL'] = 'href="%sindex.html"' % baseDirs[htmlURLs.index(absURL + '/')] 745 elif absURL in srcs: 746 href['finalURL'] = srcs[absURL]['finalURL'] 747 href['type'] = 1 748 elif absURL in downloadURLs: 749 href['finalURL'] = srcs[absURL.replace('/at_download', '')]['finalURL'] 750 href['type'] = 1 751 elif absURL.startswith(portalURL): 752 href['finalURL'] = '' 753 else: 754 href['finalURL'] = 'href="%s"' % absURL 755 756 def _updateHtmls(self, splitHtmls, infoLists, baseURLs, baseDirs, srcs, hrefs): 710 757 for i in range(len(splitHtmls)): 711 758 splitHtml = splitHtmls[i] … … 714 761 if infoList[j] in ('src', 'jsfname', 'fvars'): 715 762 absURL, flash = self._getAbsoluteURL(splitHtml[j], infoList[j], baseURLs[i]) 763 src = srcs[absURL] 764 if baseDirs[i] != '' and len(src['usedBy']) > 1: 765 finalURL = '../../' + src['finalURL'] 766 else: 767 finalURL = src['finalURL'] 716 768 if infoList[j] == 'src': 717 splitHtml[j] = srcs[absURL]['finalURL']769 splitHtml[j] = finalURL 718 770 elif infoList[j] == 'jsfname': 719 splitHtml[j] = '.'.join( srcs[absURL]['finalURL'].split('.')[:-1])771 splitHtml[j] = '.'.join(finalURL.split('.')[:-1]) 720 772 elif infoList[j] == 'fvars': 721 773 if flash == 'mp3': 722 splitHtml[j] = 'file=%s' % srcs[absURL]['finalURL']774 splitHtml[j] = 'file=%s' % finalURL 723 775 elif flash == 'pilot': 724 splitHtml[j] = 'xml=%s' % srcs[absURL]['finalURL']776 splitHtml[j] = 'xml=%s' % finalURL 725 777 elif flash == 'flv1': 726 splitHtml[j] = "config={videoFile: '../%s'}" % srcs[absURL]['finalURL']778 splitHtml[j] = "config={videoFile: '../%s'}" % finalURL 727 779 elif flash == 'flv2': 728 splitHtml[j] = "config={videoFile: \\'../%s\\'}" % srcs[absURL]['finalURL'] 780 splitHtml[j] = "config={videoFile: \\'../%s\\'}" % finalURL 781 elif infoList[j] == 'href': 782 absURL, flash = self._getAbsoluteURL(splitHtml[j], infoList[j], baseURLs[i]) 783 href = hrefs[absURL] 784 if href['type'] == 1: 785 if baseDirs[i] != '' and len(href['usedBy']) > 1: 786 finalURL = 'href="../../%s"' % href['finalURL'] 787 else: 788 finalURL = 'href="%s"' % href['finalURL'] 789 else: 790 finalURL = href['finalURL'] 791 splitHtml[j] = finalURL 729 792 730 793 def _updatePilotXMLs(self, xmlDatas, baseURLs, srcs): … … 735 798 if xmlInfoList[j] in ('src', 'XMLmp3'): 736 799 absURL2, flash = self._getAbsoluteURL(splitXML[j], xmlInfoList[j]) 737 splitXML[j] = srcs[absURL2]['finalURL'] 800 src = srcs[absURL2] 801 if len(src['usedBy']) > 1: 802 splitXML[j] = '../../' + src['finalURL'] 803 else: 804 splitXML[j] = src['finalURL'] 738 805 739 806 def _addHtmlsToZip(self, splitHtmls, zip, baseDirs = None): … … 760 827 baseURLs = [] 761 828 baseDirs = [] 829 htmlURLs = [] 762 830 srcs = {} 831 hrefs = {} 763 832 htmlIndex = 0 764 833 resourceTypes = ('Content', 'Methods', 'Tools') … … 767 836 for resource in self.getResources(reftype = 'related%s' % resourceType): 768 837 htmlList.append((resource, '%s/%s/' % (resourceType.lower(), resource.getId()))) 769 838 770 839 for html in htmlList: 771 splitHtml, infoList, baseURL = self._splitHTML(html[0].standalone_view() )840 splitHtml, infoList, baseURL = self._splitHTML(html[0].standalone_view(), self.portal_url()) 772 841 splitHtmls.append(splitHtml) 773 842 infoLists.append(infoList) 774 843 baseURLs.append(baseURL) 775 844 baseDirs.append(html[1]) 776 self._processHtml(splitHtml, infoList, srcs, baseURL, htmlIndex) 845 htmlURLs.append(html[0].absolute_url() + '/') 846 self._processHtml(splitHtml, infoList, srcs, hrefs, baseURL, htmlIndex) 777 847 htmlIndex += 1 778 848 … … 781 851 zip = zipfile.ZipFile(zipStr, 'w', compression = zipfile.ZIP_DEFLATED) 782 852 self._downloadFiles(srcs, zip, baseDirs) 783 self._updateHtmls(splitHtmls, infoLists, baseURLs, srcs) 853 self._processHrefs(hrefs, srcs, htmlURLs, baseDirs, self.portal_url()) 854 self._updateHtmls(splitHtmls, infoLists, baseURLs, baseDirs, srcs, hrefs) 784 855 self._updatePilotXMLs(xmlDatas, baseURLs, srcs) 785 856 self._addHtmlsToZip(splitHtmls, zip, baseDirs)
