运维开发网

Ruby 爬虫模板

运维开发网 https://www.qedev.com 2020-08-13 11:34 出处:网络 作者:运维开发网整理
require ‘restclient‘ require ‘open-uri‘ require ‘open_uri_redirections‘ require ‘nokogiri‘ require ‘json‘ require ‘yaml‘ require ‘fileutils‘ require ‘base64‘ MAX_RETRY_TIMES = 5 ROOT_DIR = ‘/home/zn/w
# Crawler template: walks a range of numeric page ids under BASE_URL, pulls
# two fields out of every page and appends them to a YAML file in batches.
# Failures are written to three separate log files so that the affected
# links can be re-crawled later.

require 'restclient'
require 'open-uri'
require 'open_uri_redirections'
require 'nokogiri'
require 'json'
require 'yaml'
require 'fileutils'
require 'base64'
require 'logger' # Logger is used below and must be loaded explicitly

# Maximum number of attempts get_doc makes for one link.
MAX_RETRY_TIMES = 5
# Directory that download_image writes into.
ROOT_DIR = '/home/zn/work/small-tools-master/zlk/tu/'
# Page URL prefix; the numeric page id is appended to it.
BASE_URL = 'https://newceshiao.com/mnkc/tiku/?id='
# Cookies sent with every page request.
# NOTE(review): this looks like a real session token committed in source --
# consider loading it from the environment instead.
COOKIE = {
  VerificationCodeNum: '1',
  QZ_KSUser: 'UserID=15357507&UserName=ppkao1520606811&UserToken=cw05IVsvRbyxuPoQeQIU4%252bZNshdiFE%252fN6LGCVScB%252bnQLBUYAu7SA7A%253d%253d'
}

# Header values for the alternative request styles sketched inside get_doc;
# the active code path does not read them.
@cookie = 'VerificationCodeNum=1; PPKAO=PPKAOSTID%3D987%26PPKAOCEID%3D%26PPKAOSJID%3D%26UserName%3D%26EDays%3D'
@agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/63.0.3239.84 Chrome/63.0.3239.84 Safari/537.36"
@content_type = "application/x-www-form-urlencoded"

# One log file per failure category.
@download_error = Logger.new('download_error.log')
@no_doc = Logger.new('nodoc_error.log')
@parse_error = Logger.new('parse_error.log')

# mkdir_p is a no-op when the directory already exists, so no existence
# check (and no removed File.exists?) is needed.
FileUtils.mkdir_p(ROOT_DIR)

# Downloads +image+ into ROOT_DIR under a generated name
# (<unix time><4 random digits>.<suffix>).
#
# @param image [String] URL of the image
# @return [String, nil] generated file name, or nil when the download or the
#   write failed (the error message is printed)
def download_image(image)
  suffix = image.sub(/.+\./, '')
  img = format('%d%04d.%s', Time.now.to_i, rand(10_000), suffix)
  # Fetch first and write afterwards, so a failed download leaves no empty
  # file behind. The block form closes the remote stream.
  data = URI.open(image, &:read)
  # Binary write: image bytes must not go through newline translation.
  File.binwrite(File.join(ROOT_DIR, img), data)
  img
rescue StandardError => e
  puts e.message
  nil
end

# Reads +image_src+ and returns its content Base64-encoded.
#
# @param image_src [String] URL (anything else is handed to Kernel#open by
#   URI.open, so only pass trusted values)
# @return [String] Base64 text as produced by Base64.encode64
def img_base64(image_src)
  Base64.encode64(URI.open(image_src, &:read))
end

# Fetches +search_link+ and parses the body as HTML.
#
# Makes up to MAX_RETRY_TIMES attempts; every failed attempt prints the
# error message and is recorded in download_error.log.
#
# @param search_link [String] page URL
# @return [Nokogiri::HTML::Document, nil] parsed page, or nil when every
#   attempt failed
def get_doc(search_link)
  retry_times = 0
  begin
    # Alternative 1 -- open-uri with explicit headers:
    #   doc = Nokogiri::HTML(open(search_link,
    #     "Cookie" => @cookie,
    #     "User-Agent" => @agent,
    #     "Referer" => "https://study.chinaedu.com/megrez/synchronous/list.do?gradeCode=0201&specialtyCode=02",
    #     "Host" => "study.chinaedu.com",
    #     :allow_redirections => :all
    #   ))
    #
    # Alternative 2 -- form POST returning JSON:
    #   RestClient.post(url, {access_token: access_token, image: image}, {content_type: @content_type}) do |response|
    #     body = JSON.parse(response.body)
    #     return body["words_result"][0]["words"]
    #   end

    # Without a block RestClient follows GET redirects on its own and raises
    # on error statuses. Calling follow_redirection by hand on a response
    # that is not a redirect raises, which made ordinary 200 pages fail.
    response = RestClient.get(search_link, { cookies: COOKIE })
    Nokogiri::HTML(response.body)
  rescue StandardError => e
    puts e.message
    retry_times += 1
    @download_error.error "download error: #{search_link}"
    retry if retry_times < MAX_RETRY_TIMES
    nil
  end
end

# Fetches one page and extracts its record.
#
# @param link [String] page URL
# @return [Hash, nil] {'ctg_one' => ..., 'ctg_two' => ...}, or nil when the
#   page could not be fetched (logged to nodoc_error.log) or an expected
#   element is missing (logged to parse_error.log)
def scrape_page(link)
  puts link
  doc = get_doc(link)
  if doc.nil?
    @no_doc.error link
    return nil
  end

  begin
    {
      'ctg_one' => doc.css('.ttop h3 a')[0].text,
      # at_css returns the first <img> node; indexing the NodeSet itself
      # with a String raises TypeError.
      'ctg_two' => doc.at_css('img')['src']
    }
  rescue StandardError
    @parse_error.error link
    nil
  end
end

# Crawls every page id in +pages+ and appends the extracted records to
# +output+, one YAML document per batch.
#
# @param pages [Enumerable<Integer>] page ids appended to BASE_URL
# @param batch_size [Integer] number of pages per write
# @param output [String] path of the YAML file that is appended to
# @return [void]
def process(pages = 18283..18583, batch_size: 10, output: 'result.yaml')
  # Slicing the id list keeps the write independent of individual page
  # failures: a failed 10th or last page can no longer skip the flush.
  pages.each_slice(batch_size) do |batch|
    records = []
    batch.each do |id|
      record = scrape_page(BASE_URL + id.to_s)
      next if record.nil?

      records << record
      sleep rand(4..10) # random pause between successful requests
    end
    next if records.empty?

    File.open(output, 'a+') { |f| YAML.dump(records, f) }
  end
end

process

扫码领视频副本.gif

0

精彩评论

暂无评论...
验证码 换一张
取 消

关注公众号