From c527a2ff2c33142586cb6e06b7a9b55600cc471e Mon Sep 17 00:00:00 2001
From: dushyant7917
Date: Thu, 12 Oct 2017 21:00:37 +0530
Subject: [PATCH 1/2] crawler to crawl news from livehindustan.com

---
 LiveHindustan/LiveHindustan/__init__.py    |  0
 LiveHindustan/LiveHindustan/__init__.pyc   | Bin 0 -> 184 bytes
 LiveHindustan/LiveHindustan/items.py       | 14 +++
 LiveHindustan/LiveHindustan/middlewares.py | 56 +++++++++++
 LiveHindustan/LiveHindustan/pipelines.py   | 11 +++
 LiveHindustan/LiveHindustan/settings.py    | 90 ++++++++++++++++++
 LiveHindustan/LiveHindustan/settings.pyc   | Bin 0 -> 342 bytes
 .../LiveHindustan/spiders/__init__.py      |  4 +
 .../LiveHindustan/spiders/__init__.pyc     | Bin 0 -> 192 bytes
 .../LiveHindustan/spiders/news_spider.py   | 60 ++++++++++++
 .../LiveHindustan/spiders/news_spider.pyc  | Bin 0 -> 3289 bytes
 LiveHindustan/README.md                    | 16 ++++
 LiveHindustan/requirements.txt             | 26 +++++
 LiveHindustan/scrapy.cfg                   | 11 +++
 14 files changed, 288 insertions(+)
 create mode 100644 LiveHindustan/LiveHindustan/__init__.py
 create mode 100644 LiveHindustan/LiveHindustan/__init__.pyc
 create mode 100644 LiveHindustan/LiveHindustan/items.py
 create mode 100644 LiveHindustan/LiveHindustan/middlewares.py
 create mode 100644 LiveHindustan/LiveHindustan/pipelines.py
 create mode 100644 LiveHindustan/LiveHindustan/settings.py
 create mode 100644 LiveHindustan/LiveHindustan/settings.pyc
 create mode 100644 LiveHindustan/LiveHindustan/spiders/__init__.py
 create mode 100644 LiveHindustan/LiveHindustan/spiders/__init__.pyc
 create mode 100644 LiveHindustan/LiveHindustan/spiders/news_spider.py
 create mode 100644 LiveHindustan/LiveHindustan/spiders/news_spider.pyc
 create mode 100644 LiveHindustan/README.md
 create mode 100644 LiveHindustan/requirements.txt
 create mode 100644 LiveHindustan/scrapy.cfg

diff --git a/LiveHindustan/LiveHindustan/__init__.py b/LiveHindustan/LiveHindustan/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/LiveHindustan/LiveHindustan/__init__.pyc b/LiveHindustan/LiveHindustan/__init__.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..469f1d45dab0dc5400d3df752ae20c327c6f819d
GIT binary patch
literal 184
[base85-encoded binary data omitted]

literal 0
HcmV?d00001

diff --git a/LiveHindustan/LiveHindustan/items.py b/LiveHindustan/LiveHindustan/items.py
new file mode 100644
index 0000000..b2abb44
--- /dev/null
+++ b/LiveHindustan/LiveHindustan/items.py
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
+class LivehindustanItem(scrapy.Item):
+    # define the fields for your item here like:
+    # name = scrapy.Field()
+    pass
diff --git a/LiveHindustan/LiveHindustan/middlewares.py b/LiveHindustan/LiveHindustan/middlewares.py
new file mode 100644
index 0000000..99ff5d6
--- /dev/null
+++ b/LiveHindustan/LiveHindustan/middlewares.py
@@ -0,0 +1,56 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# http://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
+class LivehindustanSpiderMiddleware(object):
+    # Not all methods need to be defined. If a method is not defined,
+    # scrapy acts as if the spider middleware does not modify the
+    # passed objects.
+
+    @classmethod
+    def from_crawler(cls, crawler):
+        # This method is used by Scrapy to create your spiders.
+        s = cls()
+        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+        return s
+
+    def process_spider_input(self, response, spider):
+        # Called for each response that goes through the spider
+        # middleware and into the spider.
+
+        # Should return None or raise an exception.
+        return None
+
+    def process_spider_output(self, response, result, spider):
+        # Called with the results returned from the Spider, after
+        # it has processed the response.
+
+        # Must return an iterable of Request, dict or Item objects.
+        for i in result:
+            yield i
+
+    def process_spider_exception(self, response, exception, spider):
+        # Called when a spider or process_spider_input() method
+        # (from other spider middleware) raises an exception.
+
+        # Should return either None or an iterable of Response, dict
+        # or Item objects.
+        pass
+
+    def process_start_requests(self, start_requests, spider):
+        # Called with the start requests of the spider, and works
+        # similarly to the process_spider_output() method, except
+        # that it doesn’t have a response associated.
+
+        # Must return only requests (not items).
+        for r in start_requests:
+            yield r
+
+    def spider_opened(self, spider):
+        spider.logger.info('Spider opened: %s' % spider.name)
diff --git a/LiveHindustan/LiveHindustan/pipelines.py b/LiveHindustan/LiveHindustan/pipelines.py
new file mode 100644
index 0000000..92463e9
--- /dev/null
+++ b/LiveHindustan/LiveHindustan/pipelines.py
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LivehindustanPipeline(object):
+    def process_item(self, item, spider):
+        return item
diff --git a/LiveHindustan/LiveHindustan/settings.py b/LiveHindustan/LiveHindustan/settings.py
new file mode 100644
index 0000000..4cdc37e
--- /dev/null
+++ b/LiveHindustan/LiveHindustan/settings.py
@@ -0,0 +1,90 @@
+# -*- coding: utf-8 -*-
+
+# Scrapy settings for LiveHindustan project
+#
+# For simplicity, this file contains only settings considered important or
+# commonly used.
+# You can find more settings consulting the documentation:
+#
+#     http://doc.scrapy.org/en/latest/topics/settings.html
+#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+
+BOT_NAME = 'LiveHindustan'
+
+SPIDER_MODULES = ['LiveHindustan.spiders']
+NEWSPIDER_MODULE = 'LiveHindustan.spiders'
+
+
+# Crawl responsibly by identifying yourself (and your website) on the user-agent
+#USER_AGENT = 'LiveHindustan (+http://www.yourdomain.com)'
+
+# Obey robots.txt rules
+ROBOTSTXT_OBEY = True
+
+# Configure maximum concurrent requests performed by Scrapy (default: 16)
+#CONCURRENT_REQUESTS = 32
+
+# Configure a delay for requests for the same website (default: 0)
+# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
+# See also autothrottle settings and docs
+#DOWNLOAD_DELAY = 3
+# The download delay setting will honor only one of:
+#CONCURRENT_REQUESTS_PER_DOMAIN = 16
+#CONCURRENT_REQUESTS_PER_IP = 16
+
+# Disable cookies (enabled by default)
+#COOKIES_ENABLED = False
+
+# Disable Telnet Console (enabled by default)
+#TELNETCONSOLE_ENABLED = False
+
+# Override the default request headers:
+#DEFAULT_REQUEST_HEADERS = {
+#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
+#   'Accept-Language': 'en',
+#}
+
+# Enable or disable spider middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
+#SPIDER_MIDDLEWARES = {
+#    'LiveHindustan.middlewares.LivehindustanSpiderMiddleware': 543,
+#}
+
+# Enable or disable downloader middlewares
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
+#DOWNLOADER_MIDDLEWARES = {
+#    'LiveHindustan.middlewares.MyCustomDownloaderMiddleware': 543,
+#}
+
+# Enable or disable extensions
+# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
+#EXTENSIONS = {
+#    'scrapy.extensions.telnet.TelnetConsole': None,
+#}
+
+# Configure item pipelines
+# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
+#ITEM_PIPELINES = {
+#    'LiveHindustan.pipelines.LivehindustanPipeline': 300,
+#}
+
+# Enable and configure the AutoThrottle extension (disabled by default)
+# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
+#AUTOTHROTTLE_ENABLED = True
+# The initial download delay
+#AUTOTHROTTLE_START_DELAY = 5
+# The maximum download delay to be set in case of high latencies
+#AUTOTHROTTLE_MAX_DELAY = 60
+# The average number of requests Scrapy should be sending in parallel to
+# each remote server
+#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+# Enable showing throttling stats for every response received:
+#AUTOTHROTTLE_DEBUG = False
+
+# Enable and configure HTTP caching (disabled by default)
+# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+#HTTPCACHE_ENABLED = True
+#HTTPCACHE_EXPIRATION_SECS = 0
+#HTTPCACHE_DIR = 'httpcache'
+#HTTPCACHE_IGNORE_HTTP_CODES = []
+#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
diff --git a/LiveHindustan/LiveHindustan/settings.pyc b/LiveHindustan/LiveHindustan/settings.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dbb49ddb1fb656fc054cc9766db4faf047b0896
GIT binary patch
literal 342
[base85-encoded binary data omitted]
literal 0
HcmV?d00001

diff --git a/LiveHindustan/LiveHindustan/spiders/__init__.py b/LiveHindustan/LiveHindustan/spiders/__init__.py
new file mode 100644
index 0000000..ebd689a
--- /dev/null
+++ b/LiveHindustan/LiveHindustan/spiders/__init__.py
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.
diff --git a/LiveHindustan/LiveHindustan/spiders/__init__.pyc b/LiveHindustan/LiveHindustan/spiders/__init__.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90725312817187922d5394fec776d7b71f83cbf5
GIT binary patch
literal 192
[base85-encoded binary data omitted]
[diffs for spiders/news_spider.py, spiders/news_spider.pyc, README.md, requirements.txt and scrapy.cfg listed in the stat above are not recoverable]

From: dushyant7917
Date: Thu, 12 Oct 2017 21:03:52 +0530
Subject: [PATCH 2/2] comment added to news_spider.py

---
 LiveHindustan/LiveHindustan/spiders/news_spider.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/LiveHindustan/LiveHindustan/spiders/news_spider.py b/LiveHindustan/LiveHindustan/spiders/news_spider.py
index b537b3d..aa8d40b 100644
--- a/LiveHindustan/LiveHindustan/spiders/news_spider.py
+++ b/LiveHindustan/LiveHindustan/spiders/news_spider.py
@@ -35,9 +35,6 @@ def parse(self, response):
 
         news_summary = scrape_data(Selector(response).xpath('//div[@class="upper-first "]/div/p/text()').extract())
 
-        hindi_month = ['जनवरी,','फरवरी,','मार्च,','अप्रैल,','मई,','जून,','जुलाई,','अगस्त,','सितंबर,','अक्तूबर,','नवम्बर,','दिसम्बर,']
-        english_month = ['JAN','FEB','MAR','APR','MAY','JUN','JUL','AUG','SEP','OCT','NOV','DEC']
-
         news_date_time = scrape_data(Selector(response).xpath('//div[@class="list-time-tags tags-list"]/span/text()[not(ancestor::*[@class="list-tags"])]').extract())
         date_time_list = []
         for i in news_date_time:
@@ -47,6 +44,8 @@
 
             date_time_list.append(i)
 
+        # The code below just prints the scraped news.
+        # It could be processed further instead, e.g. inserted into a database.
         for i,j,k,l,m in zip(news_titles,news_urls,image_urls,news_summary,date_time_list):
             print "News title : " + i
             print "News link : " + 'http://www.livehindustan.com' + j
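
Note on the comment added in PATCH 2/2: parse() currently only prints each story. A minimal sketch of the "insert it in a database" idea is below. It assumes the spider is changed to yield dict items; the field names (title, link, image_url, summary, published_at) and the SqliteNewsPipeline class are illustrative and not part of this patch.

# -*- coding: utf-8 -*-
# Sketch only: an item pipeline that stores scraped news in SQLite instead of printing it.
# Assumes parse() yields dicts like
#   {'title': ..., 'link': ..., 'image_url': ..., 'summary': ..., 'published_at': ...}
# Field names are hypothetical, not taken from the patch.
import sqlite3


class SqliteNewsPipeline(object):

    def open_spider(self, spider):
        # Open (or create) the database when the crawl starts.
        self.conn = sqlite3.connect('news.db')
        self.conn.execute(
            'CREATE TABLE IF NOT EXISTS news '
            '(title TEXT, link TEXT, image_url TEXT, summary TEXT, published_at TEXT)'
        )

    def process_item(self, item, spider):
        # Insert one row per scraped story and pass the item on unchanged.
        self.conn.execute(
            'INSERT INTO news VALUES (?, ?, ?, ?, ?)',
            (item.get('title'), item.get('link'), item.get('image_url'),
             item.get('summary'), item.get('published_at'))
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()

To take effect, such a pipeline would also have to be registered in settings.py, e.g. ITEM_PIPELINES = {'LiveHindustan.pipelines.SqliteNewsPipeline': 300}; the ITEM_PIPELINES block added by this patch is currently commented out.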