python - Recursively scrape website using scrapy

I need to recursively crawl through a website. This is what I have right now:

    from scrapy.contrib.spiders import CrawlSpider, Rule
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor

    from myproject.items import DemoSampleItem  # adjust "myproject" to your project package

    class DemoSpider(CrawlSpider):
        name = 'sample_recursive'
        allowed_domains = ['www.example.org']
        start_urls = ["http://example.org"]
        rules = [
            Rule(SgmlLinkExtractor(allow=(r'/.org/site/id/[\w*\w*]*'), deny=(r'.org/subscription')),
                 callback='parse_start_url', follow=True)
        ]

        def parse_start_url(self, response):
            items = []
            item = DemoSampleItem()
            item["source_url"] = response.url
            item["title"] = response.xpath('//div[@class="content-title"]/h2/text()')[0].extract()
            item["breadcrumb"] = response.xpath("//ul[@class='breadcrumbs']")[0].extract()
            item["content"] = response.xpath("//div[@class='main_col']")[0].extract()
            item["right_col"] = response.xpath("//div[@class='right_col']").extract()
            item["left_col"] = response.xpath("//div[@class='left_col']")[0].extract()
            item["depth"] = response.meta.get('depth', 0)
            items.append(item)
            return items

I want it to scrape through pages like "example.org", "example.org/site/id/home", "example.org/site/id/partners" and "example.org/site/id/home/our-values", and save each item through the pipeline as its own MySQL DB entry.

    import MySQLdb

    class AcdiSamplePipeline(object):
        # connect to the DB
        db_connection = MySQLdb.connect(host='localhost', user='user', passwd='passwd',
                                        db='dbname', charset='utf8', use_unicode=True)
        # create a database cursor
        cursor = db_connection.cursor()

        def process_item(self, item, spider):
            source_url = item["source_url"]
            title = item["title"].encode('utf-8')
            breadcrumb = item["breadcrumb"].encode('utf-8')
            content = item["content"].encode('utf-8')
            left_col = item["left_col"].encode('utf-8')
            right_col = item["right_col"].encode('utf-8')
            depth = item["depth"]

            try:
                self.cursor.execute("""INSERT INTO table_name (source_url, title, breadcrumb, content,
                                       right_col, left_col, page_depth)
                                       VALUES (%s, %s, %s, %s, %s, %s, %s)""",
                                    (source_url, title, breadcrumb, content, right_col, left_col, depth))
                self.db_connection.commit()
            except MySQLdb.Error, e:
                print("--------------- Printing DB error(s) -------------------")
                print("Error while writing to DB %d: %s" % (e.args[0], e.args[1]))

            return item
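For reference, Scrapy only runs a pipeline that is enabled in the project's settings.py. A minimal sketch, assuming a hypothetical project package named myproject (newer Scrapy versions take a dict with an order value; older ones accepted a plain list):

    # settings.py -- "myproject" is a placeholder for the actual project package
    ITEM_PIPELINES = {
        'myproject.pipelines.AcdiSamplePipeline': 300,  # 0-1000; lower numbers run earlier
    }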

But it only scrapes and saves "example.org" into the database. Any thoughts on why it wouldn't recursively go through the site?

The regular expression in the LinkExtractor looks wrong:

    allow=(r'/.org/site/id/[\w*\w*]*')
    #        ^--- this slash doesn't belong here
    # Plus, the dot should be escaped, or else it matches any character.

It seems you want a regex more like:

    allow=(r'[.]org/site/id/.+')
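A quick way to sanity-check both patterns, assuming they are matched against the absolute URL (which is what Scrapy's link extractors do), is to try them with Python's re module:

    import re

    url = 'http://example.org/site/id/home'

    # original pattern: the leading slash and unescaped dot prevent a match
    print(re.search(r'/.org/site/id/[\w*\w*]*', url))  # -> None

    # suggested pattern: "[.]" matches a literal dot, so the URL is accepted
    print(re.search(r'[.]org/site/id/.+', url))        # -> a match object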

Tags: python, recursion, web-scraping, scrapy, web-crawler
