Changeset 1978


Ignore:
Timestamp:
09/10/07 14:26:32 (12 years ago)
Author:
jukka
Message:

Worked on #1520. Added a whitelist of sites that we accept as sources for embedded stuff.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/LeMillTool.py

    r1977 r1978  
    3838 
    3939 
     40good_sites=["http://www.youtube.com/", 
     41    "http://video.google.com/", 
     42    "http://s3.amazonaws.com/slideshare/", 
     43    "http://www.macromedia.com/go/", 
     44    "http://odeo.com/", 
     45    "http://fpdownload.macromedia.com/", 
     46    "http://www.schooltube.com/", 
     47    "http://maps.google.com/maps/",  
     48    "http://ourmedia.org/players/1pixelout/audio-player.js",  
     49    "http://channels.ourmedia.org/",  
     50    "http://www.archive.org/"] 
     51 
     52urlfinder=re.compile(r"(http://\S*)", re.IGNORECASE) 
     53 
    4054pattern=re.compile(r""" 
    41     (?P<html_open><[a-z].*?>) # opening html tags, those that begin with '<x', where x is a letter  
     55    (?P<html_open>(<|&lt;)[a-z].*?>|(&gt)) # opening html tags, those that begin with '<x', where x is a letter  
    4256    |(?P<html_close></.*?>) # closing html tags, those that begin with '</' 
    4357    |(?P<url>(?<!"|')http://\S*) # http://something, where http is not preceded with " or ' 
     
    7488    |(?P<embed>embed) 
    7589    |(?P<object>object) 
     90    |(?P<iframe>iframe) 
     91    |(?P<script>script) 
    7692    """, re.IGNORECASE | re.VERBOSE) 
    7793 
     
    95111        # global regex 'pattern' contains expressions to find these cases and groups them by type         
    96112        # These are the methods to replace matched cases of certain type 
    97  
    98         print 'parse_text called' 
    99113     
    100114        def html_open(match): 
     
    103117            tag_match=re.match(whitelist,tag) 
    104118            if tag_match: 
    105                 tag=tag_match.group().lower() 
     119                tag=tag_match.group() 
    106120                if not full_tag.endswith('/>'): # also deals with self-closing tags like <br/> 
    107121                    open_tags.append(tag) 
     
    109123            tag_match=re.match(restricted, tag)             
    110124            if tag_match: 
    111                 print tag_match.group() 
    112                 print tag_match.groups() 
     125                if self.isGoodeEmbed(full_tag): 
     126                    return full_tag 
     127                print 'BAD embed: %s' % full_tag 
    113128                return '' 
    114129            return '' 
     
    210225            # if we're inside tags. I'm not sure, must test this theory. 
    211226 
    212             print match.groupdict() 
    213227            if match.group('html_open'): 
    214228                return html_open(match) 
     
    231245     
    232246        open_tags=[] 
    233         print text 
    234247        return pattern.sub(replacements, text) 
     248 
     249 
    def isGoodEmbed(self, code):
        """Check whether every URL found in ``code`` comes from a whitelisted site.

        ``code`` is a piece of markup (html_open() passes the full text of a
        restricted tag).  Every URL that the module-level ``urlfinder`` regex
        extracts from it must start with one of the prefixes listed in the
        module-level ``good_sites``.

        Returns True when all extracted URLs are whitelisted, False as soon
        as one is not.  Markup in which ``urlfinder`` finds no URL at all is
        accepted (returns True).
        NOTE(review): that last case means a restricted tag without any
        detectable URL passes this check -- confirm this is intended.
        """
        for url in urlfinder.findall(code):
            for nice_site in good_sites:
                if url.startswith(nice_site):
                    # one matching prefix is enough for this URL
                    break
            else:
                # the whitelist was exhausted without a match
                return False
        return True

    # html_open() calls this check as self.isGoodeEmbed (misspelled); keep
    # that spelling resolvable so the call does not raise AttributeError.
    isGoodeEmbed = isGoodEmbed
     266 
    235267     
    236268             
     
    768800 
    769801 
     802         
     803 
    770804InitializeClass(LeMillTool) 
Note: See TracChangeset for help on using the changeset viewer.