python - Recursively scrape website using scrapy -
python - Recursively scrape website using scrapy -
i need recursively crawl through website. have right now:
class demospider(crawlspider): name = 'sample_recursive' allowed_domains = ['www.example.org'] start_urls = [ "http://example.org" ] rules = [rule(sgmllinkextractor(allow=(r'/.org/site/id/[\w*\w*]*'), deny=(r'.org/subscription')), callback='parse_start_url', follow=true)] def parse_start_url(self, response): items = [] item = demosampleitem() item["source_url"] = response.url item["title"] = response.xpath('//div[@class="content-title"]/h2/text()')[0].extract() item["breadcrumb"] = response.xpath("//ul[@class='breadcrumbs']")[0].extract() item["content"] = response.xpath("//div[@class='main_col']")[0].extract() item["right_col"] = response.xpath("//div[@class='right_col']").extract() item["left_col"] = response.xpath("//div[@class='left_col']")[0].extract() item["depth"] = response.meta.get('depth', 0) items.append(item) homecoming items
i want scrape through pages like: "example.org", "example.org/site/id/home", "example.org/site/id/partners" , "example.org/site/id/home/our-values" , save each item pipeline mysql db entry of own.
class acdisamplepipeline(object): # connect db_connection = mysqldb.connect(host='localhost', user='user', passwd='passwd', db='dbname' , charset='utf8', use_unicode=true) # create database cursor cursor = db_connection.cursor() def process_item(self, item, spider): source_url = item["source_url"] title = item["title"].encode('utf-8') breadcrumb = item["breadcrumb"].encode('utf-8') content = item["content"].encode('utf-8') left_col = item["left_col"].encode('utf-8') right_col = item["right_col"].encode('utf-8') depth = item["depth"] try: self.cursor.execute("""insert table_name (source_url, title, breadcrumb, content , right_col, left_col, page_depth) values (%s, %s, %s, %s, %s, %s, %s)""", (source_url , title , breadcrumb , content , right_col , left_col , depth)) self.db_connection.commit() except mysqldb.error, e: print("--------------- printing db error(s) -------------------") print "error while db write %d: %s" % (e.args[0], e.args[1]) homecoming item
but of scraping , saving "example.org" in database. thought why wouldn't recursively go through site?
the regular look in linkextractor looks wrong:
allow=(r'/.org/site/id/[\w*\w*]*') # ^--- slash doesn't belong here. # plus, dot should escaped, else matches character
it seems want regex more like:
allow=(r'[.]org/site/id/.+')
python recursion web-scraping scrapy web-crawler
Comments
Post a Comment