Monday 29 February 2016

Python Project : Media Downloader

Project Description :

The script has to parse a configuration file called feeds.txt that contains four columns. The fourth column contains the URL of an XML file. The script should download this XML file, parse it, and find all media files such as .png, .jpg, or .pdf files. The links to the media files are relative to the XML file.

Following is the sample feeds.txt file :

landau feed1 landaumedia http://www.ps.landaumedia.de/4cf36ec2503400027400014a/Medienlandschaft/xmlfeed.xml
landau feed2 landaumedia http://www.ps.landaumedia.de/4c8df1fe8feeb50d54000091/W%C3%B6chentlicher%20Pressespiegel/xmlfeed.xml
bahlsen  feed1 landaumedia http://www.ps.landaumedia.de/557e5bb721d6bc0e4c6572cc/T%C3%A4glicher%20Pressespiegel/xmlfeed.xml
block-house feed1 landaumedia http://www.ps.landaumedia.de/5065af429295461238cd388d/T%C3%A4glicher%20Pressespiegel/xmlfeed.xml
claas  feed1 landaumedia http://www.ps.landaumedia.de/55f9603c70e1df143858a56b/T%C3%A4glicher%20Pressespiegel/xmlfeed.xml
claas  feed2 meltwater https://app.meltwater.com/gyda/outputs/566fcddff1e0c684a2930228/rendering?apiKey=55944bdf484b6d23a61cd26e&type=xml

The script should read the above file with four columns. The first column contains the name of the client. The second column contains the name of the feed, and the third column contains the format name of the feed so that you can distinguish between the two types of feeds. The fourth and last column contains the actual feed URL.

The script has to create the following directory structure automatically for the above feeds.txt file.
├───bahlsen
│   └───feed1
│       └───landaumedia
│           └───2016-02-19
│               ├───article_1
│               ├───article_3
│               ├───article_4
│               ├───article_5
│               ├───article_6
│               ├───article_7
│               ├───article_8
│               └───article_9
├───block-house
│   └───feed1
│       └───landaumedia
│           └───2016-02-19
│               ├───article_1
│               ├───article_2
│               └───article_3
├───claas
│   ├───feed1
│   │   └───landaumedia
│   │       └───2016-02-19
│   │           ├───article_1
│   │           ├───article_2
│   │           ├───article_3
│   │           ├───article_4
│   └───feed2
│       └───meltwater
│           └───2016-02-19
└───landau
    ├───feed1
    │   └───landaumedia
    │       └───2016-02-19
    │           ├───article_1
    │           ├───article_2
    └───feed2
        └───landaumedia
            └───2016-02-19
                ├───article_1
                ├───article_2
                ├───article_3
                ├───article_4
                └───article_5

Note: article_* are the final directories where all media files (.pdf, .jpg, .png, etc.) will be stored; the result will look like the format below.

./feeds/client1/feed1/2016-02-15/xmlfeed.xml
./feeds/client1/feed1/2016-02-15/article_1/die welt.png
./feeds/client1/feed1/2016-02-15/article_1/Welt_online_Die_Handelsblatt_plant_Klage_gegen_die_Lufthansa_1.pdf
./feeds/client1/feed1/2016-02-15/article_2/zzz_handelsblatt_com_530.png
./feeds/client1/feed1/2016-02-15/article_2/Handelsblatt_Online_Medienhaus_blaest_Wechsel_der_Rechtsform_ab_2.pdf
./feeds/client1/feed1/2016-02-15/article_3/frankfurter allgemeine zeitung online.png
Python Program for above problem


#!/usr/bin/python
# coding: UTF-8

#__author__ = 'Mukesh.Kumar' 

import os,time,subprocess
from xml.dom import minidom
import urllib,urlparse

def mediaExtractor(xmlUrl, directory):
    """Extract and download the media files referenced by a feed.

    Scans the already-downloaded ``xmlfeed.xml`` inside *directory* for
    links of the form ``article_<n>/<file>.<ext>`` (same pattern the
    original shell pipeline grepped for), creates one sub-directory per
    article and downloads each media file into it.

    xmlUrl    -- absolute URL the feed was fetched from; relative media
                 links found in the feed are resolved against it.
    directory -- local directory that already contains xmlfeed.xml.
    """
    import re
    # Py2/Py3-compatible imports (the original used Py2-only modules).
    try:
        from urllib.parse import urljoin        # Python 3
        from urllib.request import urlretrieve
    except ImportError:
        from urlparse import urljoin            # Python 2
        from urllib import urlretrieve

    xml_path = os.path.join(directory, "xmlfeed.xml")
    with open(xml_path) as fh:
        content = fh.read()

    # Same pattern the original ran through `grep -iPo`; no shell needed.
    media_links = re.findall(r'article_.*\.[a-z]{2,4}', content, re.IGNORECASE)

    for link in media_links:
        if "/" not in link:
            # Need 'article_x/filename.ext'; anything else is unusable
            # (the original hit an IndexError here and swallowed it).
            continue
        art_dir, filename = link.split("/", 1)
        final_dir = os.path.join(directory, art_dir)
        if not os.path.exists(final_dir):
            os.makedirs(final_dir)

        media_url = urljoin(xmlUrl, link)
        print(art_dir)
        print(media_url)
        try:
            # Best effort: one failed download must not abort the feed.
            urlretrieve(media_url, os.path.join(final_dir, filename))
        except Exception:
            pass
 

def feedProcessing(feedList):
    """Process every normalised feed line from feeds.txt.

    Each line has the form 'client feed format url'. For each one this
    creates ./client/feed/format/<today> under the current working
    directory, downloads the feed XML into it as xmlfeed.xml, and hands
    it to mediaExtractor() to fetch the referenced media files.
    """
    # Py2/Py3-compatible import (the original used Py2-only urllib).
    try:
        from urllib.request import urlretrieve  # Python 3
    except ImportError:
        from urllib import urlretrieve          # Python 2

    for feed in feedList:
        print("Processing feed : " + feed)
        print("-----------------------------")
        parts = feed.split(" ")
        if len(parts) < 4:
            # Skip blank or malformed lines instead of crashing
            # with an IndexError like the original did.
            continue
        client, feed_name, feed_format, xmlUrl = parts[0], parts[1], parts[2], parts[3]
        today = time.strftime("%Y-%m-%d")
        directory = os.path.join(os.getcwd(), client, feed_name, feed_format, today)
        if not os.path.exists(directory):
            os.makedirs(directory)

        print("------Downloading xml file feed -------------------")
        try:
            urlretrieve(xmlUrl, os.path.join(directory, "xmlfeed.xml"))
        except Exception:
            print("Some error while downloading xml file")

        try:
            mediaExtractor(xmlUrl, directory)
        except Exception:
            print("Fail to extract media for : " + xmlUrl)


def main():
    """Read feeds.txt from the current directory and process each feed.

    Normalises each line (tabs and runs of spaces collapsed to a single
    space, surrounding whitespace stripped), skips empty lines, and
    passes the cleaned list to feedProcessing().
    """
    feedFilepath = os.path.join(os.getcwd(), "feeds.txt")
    if not os.path.exists(feedFilepath):
        print("Feed file not found in current directory")
        return  # guard clause instead of the original if/else pyramid

    print("Feed file found and processing : " + feedFilepath)
    feedList = []
    # `with` closes the file; the original leaked the handle.
    with open(feedFilepath, 'r') as feedFile:
        for line in feedFile:
            # split()/join collapses tabs and repeated spaces and also
            # strips leading/trailing whitespace and the newline.
            line = ' '.join(line.split())
            if not line:
                continue  # the original appended empty lines, crashing later
            feedList.append(line)
            print(line)
    print("------------------")
    feedProcessing(feedList)


# main()
# Entry-point guard: run main() only when executed as a script,
# not when this module is imported.
if __name__ == "__main__":
    main()

No comments:

Post a Comment

Related Posts Plugin for WordPress, Blogger...