Saturday, November 5, 2011

Urban dictionary and Youtube search for cool slang's and their usage using Python

Urban dictionary is a user maintained dictionary of slang's. A thesaurus search for a word gives all the related slang's, related words could be antonym also.
Youtube, not long ago, started video comment search which is still in beta. Here, comments on all videos are searched for entered search term.

Requirement : To get related slang's from urban dictionary for a particular word, and then query youtube usage of those slang's.

[sourcecode language="python" wraplines="false"]
from BeautifulSoup import BeautifulSoup,NavigableString
import nltk
import os
import re
import urllib2
import webbrowser
import time

def get_soup(url):
#get soup object for the url
try:
page = urllib2.urlopen(url)
except urllib2.URLError, e:
print 'Failed to fetch ' + url
raise e

try:
soup = BeautifulSoup(page)
except HTMLParser.HTMLParseError, e:
print 'Failed to parse ' + url
raise e
return soup

def get_related(word):
word_list=[]
print 'Fetching related words for '+word+'........'
soup=get_soup('http://www.urbandictionary.com/thesaurus.php?term='+word)
for td in soup.findAll('td', {'class':'word'}): #each row
rel_word=td.find('a').contents[0].encode()
print rel_word
word_list.append(rel_word)
return word_list

def get_comment(word,npages):

print 'Fetching comments for '+word+'........'
comments =[]
for i in range(1,npages+1):
soup= get_soup('http://www.youtube.com/comment_search?q='+str(word)+'&ld=1&comment_only=1&hl=en&so=pagerank&page='+str(i))
for span in soup.findAll('span', {'class':'comment-result-comment'}): #each row
comment =''
for text in span.findAll(text=True):
comment = comment+ ' '+ text
comment=re.sub('[ ]+|\n|\r',' ',comment.strip())
comment=re.sub('^[0-9]+[ ]+','',comment)
comments.append(comment.encode('utf-8'))
time.sleep(6)
return comments

def main():
word='spooky'
npages=2
word_list=get_related(word)
#word_list = ['spooky']
file1=open('word_list','w')
for w in word_list:
for comment in get_comment(w,npages):
file1.write(comment+'\n')
file1.close()

if __name__ == '__main__':
main()
[/sourcecode]

1 comment: