Sunday, September 4, 2011

Extracting IMDB data using Python

Sample code for extracting IMDB data using the Python BeautifulSoup package.

Requirement : To extract all the feature movie names and their ratings from IMDB database for a particular year.

The parameters used in the query were identified using the IMDB advanced search function. The start, count and year parameters were used for querying in this case. The URL is queried for 100 records at a time, since more than that is not allowed. After extracting the movie names and ratings for 100 records, the URL is queried for the next 100 records, and so on.

Two files 'imdb_conf' and 'ratings' are created by the code. 'imdb_conf' file keeps track of the record number last read and 'ratings' file stores the movie name, rating and year.

Web scraping the IMDB website for required data.

[sourcecode language="python"]
from BeautifulSoup import BeautifulSoup
import os
import re
import urllib2


def get_start_pos_yr(fimdb_config):
    """Work out where to resume from the progress file.

    The last line of the file is expected to look like
    "<record number>\\t<year>"; the record number is the last record
    that was fetched.

    fimdb_config -- open, readable file object for the progress file.

    Returns a (startfrom, year) tuple: startfrom is the last record
    number plus one and year is the year field as a string.  Returns
    (-1, None) when the file has fewer than two lines or when its last
    line cannot be parsed.
    """
    lines = fimdb_config.readlines()
    startfrom = -1
    year = None
    # More than one line is required before the last line is trusted.
    if len(lines) > 1:
        last_line = lines[-1]
        record = re.search('[^\t]+', last_line)
        year_field = re.search('\t[0-9]+', last_line)
        # Both fields must be present; a half-parsed line is treated as
        # "nothing to resume from" instead of raising AttributeError.
        if record and year_field:
            try:
                last_read = int(record.group())
            except ValueError:
                # Non-numeric first field (e.g. the header line): keep the
                # defaults so the caller starts from the beginning.
                pass
            else:
                startfrom = last_read + 1
                year = year_field.group().strip()
    return startfrom, year


def get_soup(url):
    """Fetch url and return its contents parsed as a BeautifulSoup object.

    url -- address of the page to download.

    Prints a short message and re-raises when the download fails with
    urllib2.URLError or the parse fails with HTMLParser.HTMLParseError.
    """
    # Imported here because the except clause below needs the name and the
    # file's import block does not provide it; without it a parse failure
    # would surface as a NameError instead.
    import HTMLParser

    try:
        page = urllib2.urlopen(url)
    except urllib2.URLError:
        print('Failed to fetch ' + url)
        raise  # bare raise keeps the original traceback

    try:
        return BeautifulSoup(page)
    except HTMLParser.HTMLParseError:
        print('Failed to parse ' + url)
        raise
    finally:
        # Release the HTTP response whether or not parsing succeeded.
        page.close()



def get_ntotal(soup):
    """Return the total number of records reported for the query.

    soup -- parsed result page; every <div id="left"> is inspected and the
            first run of digits/commas that follows a space in its first
            child is taken as the count.

    Returns the count as a string of digits with the thousands separators
    removed, or the integer 1 when no count can be found.
    """
    total_count = 1
    for div in soup.findAll('div', {'id': 'left'}):
        if not div.contents:
            # Empty div: nothing to search, and indexing [0] would raise.
            continue
        found = re.search('[ ]+[0-9,]+', div.contents[0])
        # Only overwrite on a real match so a miss cannot turn the result
        # into None (which the caller would then pass to int()).
        if found:
            total_count = found.group().replace(',', '').strip()
    return total_count


def set_rating(soup,fwimdb_config,frating,year,startfrom):
    """Write name, rating and year for every result page of one year.

    soup          -- parsed first result page to process.
    fwimdb_config -- writable progress file; receives one
                     "<record number>\\t<year>" line per record and a final
                     "-1\\t<year - 1>" line once the year is finished.
    frating       -- writable output file; receives one
                     "<name>\\t<rating>\\t<year>" line per record, with "--"
                     as the rating when none can be read.
    year          -- year being queried (anything str() accepts).
    startfrom     -- value of the "start" query parameter used for soup
                     (0 on a fresh run).
    """
    year = str(year)
    # get_ntotal may hand back a digit string or 1; "or 1" also covers a
    # missing value so the int() conversion cannot fail on None.
    total_res = int(get_ntotal(soup) or 1)
    # Continue numbering from the records that precede this page so that the
    # progress file holds absolute positions even on a resumed run.
    count_rec = max(startfrom - 1, 0)

    row_class = re.compile('(odd|even)[ a-zA-Z]*')
    title_href = re.compile('/title/tt[^/]+/$')
    rating_value = re.compile('[0-9]+[^(]+')

    while True:
        for tr in soup.findAll('tr', {'class': row_class}):  # each row
            for td in tr.findAll('td', {'class': 'title'}):
                links = td.findAll('a', {'href': title_href})
                if not links:
                    # No title link in this cell: skip it instead of
                    # reusing the previous row's name (or hitting NameError).
                    continue
                movie_name = links[-1].contents[0]  # title name
                for rating in td.findAll('div', {'class': 'rating rating-list'}):
                    count_rec += 1
                    title = rating.get('title')
                    found = rating_value.search(title) if title else None
                    score = found.group().strip() if found else "--"
                    frating.write(movie_name + "\t" + score + "\t" + year + "\n")
                    fwimdb_config.write(str(count_rec) + "\t" + year + "\n")

        # "start" is 1-based on the site: a fresh run (0) covers 1-100, so
        # the next page begins at 101.
        if startfrom == 0:
            startfrom = 101  # second run
        else:
            startfrom = startfrom + 100
        print(str(startfrom) + " " + str(total_res))

        # Strictly greater: when startfrom == total_res there is still one
        # record (the last one) left to fetch.
        if startfrom > total_res:
            fwimdb_config.write("-1" + "\t" + str(int(year) - 1) + "\n")
            break  # stop before requesting a page past the end
        soup = get_soup("http://www.imdb.com/search/title?languages=en&title_type=feature&count=100&sort=num_votes,desc&start=" + str(startfrom) + "&year=" + year)


def main():
    """Resume (or start) scraping and append the results to 'ratings'.

    Reads the resume position from 'imdb_conf', defaults to record 0 of
    year "2010" when there is nothing to resume from, fetches the first
    page and hands over to set_rating.
    """
    # "a+" creates the progress file on the first run ("r+" raised IOError
    # when it did not exist) and always appends on write.
    with open("imdb_conf", "a+") as fwimdb_conf:
        with open("ratings", "a") as frating:  # ratings
            # Read the existing progress from the top before writing anything.
            fwimdb_conf.seek(0)
            startfrom, year = get_start_pos_yr(fwimdb_conf)

            # Explicit seek before switching from reading to writing; the
            # header is only needed once, when the file is still empty.
            fwimdb_conf.seek(0, os.SEEK_END)
            if fwimdb_conf.tell() == 0:
                fwimdb_conf.write("LastreadLine\tYear\n")

            if startfrom == -1:
                startfrom = 0
            if year is None:
                year = "2010"
            print(startfrom)

            soup = get_soup("http://www.imdb.com/search/title?languages=en&title_type=feature&sort=num_votes,desc&count=100&start=" + str(startfrom) + "&year=" + year)
            set_rating(soup, fwimdb_conf, frating, year, startfrom)


# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
[/sourcecode]

For demonstration purposes only. If you plan to use IMDB data beyond personal usage, you should contact IMDB Licensing department.

1 comment:

  1. Hello! I just wanted to ask if you ever have any trouble with hackers?

    My last blog (wordpress) was hacked and I
    ended up losing months of hard work due to no backup.

    Do you have any solutions to stop hackers?

    ReplyDelete