# -*- coding: Windows-31J -*-

require 'fileutils'
require 'uri'
require 'net/http'
require 'net/https'
require 'digest/sha2'
require 'time'
require 'cgi'

$LOAD_PATH.unshift(File.dirname(__FILE__))
require 'downtable.rb'
$LOAD_PATH.shift
Net::HTTP.version_1_2



class ArcGET
	
	# File extensions treated as media. add_nextpage only follows a link taken
	# from a value="..." attribute when it matches this pattern.
	MEDIA_EXP = /\.(wav|wma|avi|asf|ogg|mp3|mp4|mpeg|mpg|mid|midi|smf|smaf|m4a|swf|flv|bmp|gif|tiff|tif|png|vob|ogm|mov|rm|divx)$/ni
	
	# Row condition codes, re-exported from URLtable so they can be referenced
	# as ArcGET::Xxx. Their values are defined in downtable.rb.
	Waiting       = URLtable::Waiting
	WaitRetry     = URLtable::WaitRetry
	
	SaveSuccess   = URLtable::SaveSuccess
	NewLocation   = URLtable::NewLocation
	NotFound      = URLtable::NotFound
	SkipScheme    = URLtable::SkipScheme
	
	ConectTimeout = URLtable::ConectTimeout
	ConnectError  = URLtable::ConnectError
	Forbidden     = URLtable::Forbidden
	AuthRequest   = URLtable::AuthRequest
	Error4xx      = URLtable::Error4xx
	Error5xx      = URLtable::Error5xx
	EOFreached    = URLtable::EOFreached
	CannotGetNetHTTP = URLtable::CannotGetNetHTTP	# httpget records this when Net::HTTP raises Net::HTTPBadResponse (the page could not be fetched through Net::HTTP)
	BadURI        = URLtable::BadURI							# httpget records this when the row's URL cannot be parsed (URI::InvalidURIError)
	
	
	# Sets up a crawler with default limits and opens the URL table.
	#
	# param - Hash of options:
	#   :path       - directory handed to URLtable as :path (default "./save")
	#   :db         - database file handed to URLtable as :db (default "./url.db3")
	#   :pendingURI - file that add_nextpage_sub appends out-of-scope URLs to;
	#                 nil or an empty value disables it
	def initialize(param={:path => nil, :db => nil, :pendingURI => nil})
		# Crawl scope: root patterns plus the maximum link count for each kind.
		@rootExp        = []
		@rootMAXlink    = 99999999
		@cgirootExp     = []
		@cgiMAXlink     = 10
		@excludeWayback = false

		# Seconds slept after each fetch, and fetches per transaction (see #start).
		@sleepTime   = 20.0
		@commitCount = 16

		@savePath = param[:path] || "./save"
		@saveDB   = param[:db]   || "./url.db3"

		pending = param[:pendingURI]
		@pendingURI = pending.to_s == "" ? nil : pending

		@list = URLtable.new({:path => @savePath, :db => @saveDB})
		@outStream = STDOUT
	end
	
	# Sets the pause (passed to Kernel#sleep by #start) taken after each fetch.
	# Returns the value that was stored.
	def sleep_setting(set)
		@sleepTime = set
	end
	
	# Switches the web.archive.org filter used by #start on (truthy) or off.
	# Returns the value that was stored.
	def exclude_wayback(set)
		@excludeWayback = set
	end
	
	# Sets how many fetches #start performs inside one @list.transaction.
	# Returns the value that was stored.
	def commit_count(set)
		@commitCount = set
	end
	
	# Changes the storage locations after construction.
	#
	# param - Hash; only keys with a truthy value are applied:
	#   :path       - new data directory (expanded, also pushed to @list.dataDir)
	#   :db         - new database file (expanded, also pushed to @list.saveDB)
	#   :pendingURI - new pending-URL file; an empty string clears it
	#
	# Always returns nil.
	def save_path(param={:path => nil, :db => nil, :pendingURI => nil})
		new_dir, new_db, new_pending = param[:path], param[:db], param[:pendingURI]

		if new_dir then
			@savePath = File.expand_path(new_dir)
			@list.dataDir = @savePath
		end

		if new_db then
			@saveDB = File.expand_path(new_db)
			@list.saveDB = @saveDB
		end

		# A value whose string form is empty switches the pending file off.
		@pendingURI = (new_pending.to_s.size == 0 ? nil : new_pending) if new_pending
		nil
	end
	
	# Replaces the stream that progress messages are written to.
	#
	# obj - object responding to write, or nil to silence output.
	#
	# Returns the stream that was in use before the call.
	def set_echo(obj=nil)
		previous, @outStream = @outStream, obj
		previous
	end
	
	# Registers a root pattern (an object responding to match, see check_root).
	#
	# root - pattern to add; nil/false clears every registered root instead.
	# max  - optional new maximum link count for root pages (converted with to_i).
	#
	# Returns nil when the list was cleared, otherwise the result of Array#uniq!.
	def add_root(root, max=nil)
		@rootMAXlink = max.to_i if max

		if root then
			@rootExp.push(root)
			@rootExp.uniq!
		else
			@rootExp = []
			nil
		end
	end
	
	# Registers a CGI root pattern (an object responding to match, see check_cgiroot).
	#
	# root - pattern to add; nil/false clears every registered CGI root instead.
	# max  - optional new maximum CGI link count (converted with to_i).
	#
	# Returns nil when the list was cleared, otherwise the result of Array#uniq!.
	def add_cgi_root(root, max=nil)
		@cgiMAXlink = max.to_i if max

		if root then
			@cgirootExp.push(root)
			@cgirootExp.uniq!
		else
			@cgirootExp = []
			nil
		end
	end
	
	# Queues a URL for crawling unless the table already knows it.
	#
	# url - anything convertible with to_s; nil/false is ignored.
	#
	# New rows get bit 0x40000000 set in their priority. When the URL already
	# exists a notice is written to the echo stream instead.
	def add_url(url)
		return unless url

		address = url.to_s
		@list.transaction do
			if @list.exists(address) then
				@outStream.write("Alerdy Exists #{address}\n") if @outStream
			else
				row = URLtable::ROW.new
				row.url = address
				row.priority |= 0x40000000
				@list.update(row)
			end
		end
	end
	
	
	# Copies response metadata onto the row.
	#
	# dat      - row object; statusCode, bytes, tryNow, timeStamp, downloadTime,
	#            checksum and contentType are written.
	# response - object answering code, body and [] (header lookup).
	#
	# A missing body (nil, e.g. for bodiless responses) is treated as an empty
	# string so the size and checksum can still be recorded.
	def save_param(dat,response)
		body = response.body.to_s
		date_header = response['Date']

		dat.statusCode = response.code.to_s
		# bytesize, not size: the column holds a byte count whatever the string encoding.
		dat.bytes = body.bytesize
		dat.tryNow = dat.tryNow + 1
		# An absent or unparsable Date header leaves the time stamp empty.
		dat.timeStamp = begin
			date_header ? Time.parse(date_header) : nil
		rescue StandardError
			nil
		end
		dat.downloadTime = Time.now
		dat.checksum = Digest::SHA256.hexdigest(body)
		dat.contentType = response['Content-Type'].to_s
	end
	
	# Writes the response body to a file below @savePath.
	#
	# dat      - row object; dat.uri supplies host/path/query and dat.savePath
	#            receives the relative path that was chosen.
	# response - object answering body.
	# cnt      - first numeric suffix to try (0 = no suffix). With cnt == 0 the
	#            relative path is derived from the URL; with cnt > 0 the
	#            existing dat.savePath is reused.
	#
	# Existing files are never overwritten: "name", "name.1" ... "name.9" are
	# tried in turn. When the target is a directory the body is stored as
	# "index.html" inside it. Returns what IO#write returns, or nil when nothing
	# was written (suffixes exhausted, or the path would leave @savePath).
	def save_body(dat,response, cnt=0)
		return if cnt > 9

		if cnt == 0 then
			uri = dat.uri
			relative = "/#{uri.host}#{uri.path}"
			query = uri.query.to_s
			unless query.empty? then
				# Percent-encode characters that are awkward in file names
				# (hand-rolled because URI.escape is no longer available).
				relative += "_" + query.gsub(/[<>:\x27\x22\x2B\x2F\x3F\x5C]/n){|c| sprintf('%%%02X', c.ord) }
			end
			dat.savePath = relative
		end

		root = File.expand_path(@savePath)
		target = File.expand_path("#{root}#{dat.savePath}")

		# URLs come from crawled pages: refuse anything that uses ".." to climb
		# out of the save directory.
		inside = root.end_with?('/') ? root : "#{root}/"
		return unless target.start_with?(inside)

		if File.directory?(target) then
			dat.savePath = File.join(dat.savePath, 'index.html')
			target = File.join(target, 'index.html')
		end
		FileUtils.mkdir_p(File.dirname(target))

		(cnt..9).each do |n|
			candidate = n > 0 ? "#{target}.#{n}" : target
			next if File.exist?(candidate)
			return File.open(candidate,'wb'){|f| f.write response.body.to_s }
		end
		nil
	end
	
	# Tells whether a link belongs to one of the registered roots.
	#
	# add - row object answering url and referrer.
	#
	# Returns true when any root pattern matches the URL or the referrer,
	# false otherwise.
	def check_root(add)
		@rootExp.any? do |pattern|
			pattern.match(add.url.to_s) || pattern.match(add.referrer.to_s)
		end
	end
	
	# Tells whether a link stays inside the registered CGI roots.
	#
	# add - row object answering url and referrer.
	#
	# Returns true only when some CGI root pattern matches the URL and some
	# (possibly different) CGI root pattern matches the referrer.
	def check_cgiroot(add)
		target_hit = @cgirootExp.any? {|pattern| pattern.match(add.url.to_s) }
		source_hit = @cgirootExp.any? {|pattern| pattern.match(add.referrer.to_s) }
		target_hit && source_hit
	end
	
	# Resolves one link found on a page and queues it when it is in scope.
	#
	# path - raw link text (HTML entities allowed); nil is ignored.
	# dat  - row of the page the link was found on (uri, url, linkCount and
	#        linkCountCGI are read).
	#
	# Links with a javascript/mailto/data/file/tel scheme and links that cannot
	# be parsed or resolved are dropped silently. In-scope links (check_root and
	# within @rootMAXlink) are stored through @list.update; other links are
	# appended to the @pendingURI file when one is configured.
	def add_nextpage_sub(path, dat)
		return unless path

		# Work on raw bytes: the escaping pattern below is a binary regexp and
		# would raise Encoding::CompatibilityError on a non-ASCII UTF-8 string.
		raw = CGI::unescapeHTML(path.to_s).dup.force_encoding('ASCII-8BIT')
		return if /^(?:javascript|mailto|data|file|tel):/ni.match(raw)

		base = dat.uri
		begin
			# Percent-encode control and non-ASCII bytes so URI.parse accepts the link.
			link = URI.parse( raw.gsub(/[\x00-\x1F\x80-\xFF]/n){|x| '%'+x.unpack('H2')[0] } )
			newuri = base + link
		rescue URI::Error
			return
		end
		newuri.fragment = nil

		add = URLtable::ROW.new
		add.url = newuri.to_s
		add.referrer = dat.url
		add.linkCount = dat.linkCount+1
		add.linkCountCGI = dat.linkCountCGI+1 if check_cgiroot(add)

		return if add.linkCountCGI > @cgiMAXlink

		if check_root(add) && (add.linkCount <= @rootMAXlink) then
			# Links matching the first root or first CGI root pattern get the priority bit.
			if @rootExp[0].match(add.url) || (@cgirootExp[0] && @cgirootExp[0].match(add.url)) then
				add.priority = add.priority | 0x40000000
			end
			@list.update add,false
		elsif @pendingURI then
			File.open(@pendingURI,'a'){|f| f.write "#{add.url}\n" }
		end
	end
	
	# Scans a response body for links and hands each one to add_nextpage_sub.
	#
	# dat      - row of the page being scanned (passed through unchanged).
	# response - object answering body; a nil body is treated as empty.
	#
	# Recognised link sources: href/src/value attributes written with double
	# quotes or without quotes, and url(...) expressions. A value="..." link is
	# only followed when it matches MEDIA_EXP.
	def add_nextpage(dat,response)
		# Scan raw bytes so that text in an unexpected or broken encoding cannot make scan raise.
		text = response.body.to_s.dup.force_encoding('ASCII-8BIT')
		# Captures: script tag, attribute name, quoted value, unquoted value, url(...) content.
		exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni
		text.scan( exp ) do |_tag, attr, quoted, bare, css|
			media_only = attr.to_s.downcase == 'value'
			[quoted, bare, css].each do |link|
				next unless link
				next if media_only && !MEDIA_EXP.match(link.to_s)
				add_nextpage_sub(link, dat)
			end
		end
	end
	
	# Queues the target of a redirect: the response's 'location' header is
	# handed to add_nextpage_sub together with the redirecting row.
	def save_newlocation(dat,response)
		target = response['location']
		add_nextpage_sub(target, dat)
	end
	
	# Renders the headers of a response as text, one "name : value" line each.
	#
	# response - object whose each yields name/value pairs.
	#
	# Returns the assembled String (empty when there are no headers).
	def responseHead(response)
		dump = ''.dup
		response.each {|name, value| dump << "#{name} : #{value}\n" }
		dump
	end
	
	# Fetches one queued row, classifies the outcome and stores it via @list.update.
	#
	# dat - row object from the URL table. execond, status, body, message,
	#       tryNow and downloadTime are written depending on the outcome.
	#
	# Outcomes:
	#   * unparsable URL            -> BadURI (also appended to ./badURI.txt)
	#   * non-HTTP / unknown scheme -> SkipScheme, nothing is requested
	#   * network failures          -> ConectTimeout, ConnectError,
	#                                  CannotGetNetHTTP, EOFreached or WaitRetry
	#   * HTTP responses            -> SaveSuccess, NewLocation, AuthRequest,
	#                                  Forbidden, NotFound, Error4xx or Error5xx
	#
	# Raises RuntimeError for a response class that fits none of the above, and
	# re-raises any unexpected error after printing diagnostics.
	def httpget(dat)
		# Parse the URL first, so that a malformed one is recorded as BadURI
		# even when echo output is switched off.
		begin
			uri = dat.uri
		rescue URI::InvalidURIError
			@outStream.write("BAD: #{Time.now.to_s} #{dat.url}    <- Referer:#{dat.referrer.to_s}\n") if @outStream
			dat.execond = URLtable::BadURI
			dat.status = ''
			dat.tryNow = dat.tryNow + 1
			dat.downloadTime = Time.now
			@list.update dat

			File.open("badURI.txt","a"){|f| f.write "#{dat.inspect}\n" }
			return
		end
		@outStream.write("Get: #{Time.now.to_s} [#{sprintf('%08X:%3d,%3d',dat.priority.to_i, dat.linkCount, dat.linkCountCGI)}] #{uri.to_s}    <- Referer:#{dat.referrer.to_s} ... ") if @outStream

		response = nil
		rescode = nil
		begin
			case uri.scheme.to_s.downcase
			when 'https'
				response = httpget_request(uri, dat.referrer.to_s, true)
			when 'http'
				response = httpget_request(uri, dat.referrer.to_s, false)
			when 'ttp'
				# "ttp://..." is fetched as "http://..." by restoring the leading "h".
				response = httpget_request(URI.parse('h'+uri.to_s), dat.referrer.to_s, false)
			when 'mailto','data','ftp','file','tel','mms','rtsp','shinsei'
				response = false
				rescode = "Skip"
				dat.execond = URLtable::SkipScheme
			else
				# A scheme nobody listed (or none at all) must not stop the whole
				# crawl: record the row as skipped and carry on.
				@outStream.write("UN KNOWN SCHEME * #{uri.scheme} ... ") if @outStream
				response = false
				rescode = "Skip"
				dat.execond = URLtable::SkipScheme
			end

		rescue Errno::ETIMEDOUT,Timeout::Error
			response = false
			rescode = "TimeOut"
			dat.execond = URLtable::ConectTimeout

		rescue SocketError,
		       Errno::EHOSTUNREACH,
		       Errno::ECONNREFUSED,
		       Errno::ECONNRESET
			response = false
			rescode = "NoHost"
			dat.execond = URLtable::ConnectError

		rescue Net::HTTPBadResponse
			response = false
			rescode  = "Can'tGet"
			dat.execond = URLtable::CannotGetNetHTTP

		rescue EOFError
			response = false
			rescode  = "EOFerr"
			dat.execond = URLtable::EOFreached

		rescue Errno::EPIPE
			response = false
			rescode  = "Retry"
			dat.execond = URLtable::WaitRetry
			dat.priority = rand(0x3fffffff)

		rescue StandardError => e
			# Fallback for timeouts that surface as some other exception class:
			# they are recognised by the first backtrace line.
			timeoutexp = /\btimeout\.rb:.*:in \x60timeout\x27/n
			if timeoutexp.match( e.backtrace.to_a[0].to_s ) then
				response = false
				rescode = "TimeOut"
				dat.execond = URLtable::ConectTimeout
			else
				p "----------------------------------"
				p e.class
				p "----------------------------------"
				p e
				p e.backtrace
				raise
			end
		end

		# Specific response classes are tested before their generic parents
		# (401/403/404/410 are all kinds of Net::HTTPClientError).
		case response
		when FalseClass
			dat.status = rescode
			dat.tryNow = dat.tryNow + 1
			dat.downloadTime = Time.now
			dat.body=''
			@list.update dat

		when Net::HTTPSuccess
			httpget_store(dat, response, URLtable::SaveSuccess, "Done")
			add_nextpage(dat,response)
			@list.update dat

		when Net::HTTPRedirection
			httpget_store(dat, response, URLtable::NewLocation, "Move")
			dat.message = "Location: "+response['location'].to_s
			save_newlocation(dat,response)
			@list.update dat

		when Net::HTTPUnauthorized
			httpget_store(dat, response, URLtable::AuthRequest, "AuthReq")
			dat.message = responseHead(response)
			@outStream.write("-*- #{response.code} Authorization Requet -*-\n") if @outStream
			@outStream.write("#{dat.message}\n") if @outStream
			add_nextpage(dat,response)
			@list.update dat

		when Net::HTTPForbidden
			httpget_store(dat, response, URLtable::Forbidden, "Forbidden")
			add_nextpage(dat,response)
			@list.update dat

		when Net::HTTPNotFound,
		     Net::HTTPGone
			httpget_store(dat, response, URLtable::NotFound, "NotFound")
			add_nextpage(dat,response)
			@list.update dat

		when Net::HTTPClientError
			httpget_store(dat, response, URLtable::Error4xx, "Err4xx")
			add_nextpage(dat,response)
			@list.update dat

		when Net::HTTPServerError
			httpget_store(dat, response, URLtable::Error5xx, "Err5xx")
			add_nextpage(dat,response)
			@list.update dat

		else
			raise "Unhandled HTTP response #{response.inspect} for #{dat.url}"
		end

		@outStream.write("#{dat.status}\n") if @outStream

	end

	# Performs a single GET request and returns the Net::HTTPResponse.
	# ssl selects HTTPS; referrer is sent as the Referer header.
	def httpget_request(uri, referrer, ssl)
		http = Net::HTTP.new(uri.host, uri.port)
		http.open_timeout = 15
		if ssl then
			http.use_ssl = true
			# NOTE(review): certificate verification is switched off, as it was
			# before this change - confirm that this is intended.
			http.verify_mode = OpenSSL::SSL::VERIFY_NONE
			http.verify_depth = 5
		end

		response = nil
		http.start do
			request = Net::HTTP::Get.new( uri.request_uri, { 'Referer'=>referrer } )
			response = http.request request
		end
		response
	end

	# Records the classification, the body and the response metadata on the row.
	def httpget_store(dat, response, execond, status)
		dat.execond = execond
		dat.status = status
		dat.body = response.body.to_s
		save_param(dat,response)
	end

	private :httpget_request, :httpget_store
	
	# Fetches a single row; currently a thin wrapper around httpget.
	def getpage(dat)
		httpget(dat)
	end
	
	# Runs the crawl loop until the table hands back a row with an empty URL.
	#
	# cond - optional condition string passed to @list.entry; when empty the
	#        default selection is used (or, with exclude_wayback enabled, a
	#        selection that leaves out http://web.archive.org/ URLs).
	# data - bind values that accompany cond.
	#
	# Rows are fetched in batches of @commitCount per @list.transaction, with a
	# pause of @sleepTime after every fetch. Returns nil.
	def start(cond='',data=[])
		while true
			@list.transaction do
				@commitCount.times do
					curr = if cond.size>0 then
						@list.entry( cond,data )
					elsif @excludeWayback then
						# The condition used to read "(NOT URL like ?;)"; the stray
						# semicolon inside the parentheses is not valid SQL.
						@list.entry(
							'(ExeCondition<?) and (NOT URL like ?) order by priority desc limit 1;', [URLtable::SaveSuccess,'http://web.archive.org/%']
						)
					else
						@list.entry
					end

					# A row with an empty URL ends the crawl - presumably the table's
					# way of saying nothing is left to fetch (verify in downtable.rb).
					return if curr.url == ''

					getpage(curr)
					sleep(@sleepTime)
				end
			end
		end
	end

	# Looks up the stored row for a URL via @list[url] and returns it.
	def read(url)
		return @list[url]
	end
	
	# Re-scans a stored row for links: the row is passed to add_nextpage both
	# as the page and as the response (add_nextpage reads its body).
	def check_link(row)
		add_nextpage(row, row)
	end
	
	# Yields every row of the URL table, one at a time, to the given block.
	# Returns whatever @list.each returns.
	def each
		@list.each {|row| yield row }
	end
	
	# Returns @list.rest unchanged.
	def rest
		@list.rest
	end
	
	# Gives the caller's block access to whatever @list.my_query yields
	# (named db here). Returns the result of @list.my_query.
	def my_query
		@list.my_query {|db| yield db }
	end
	
#	def changeStatus
#	end
=begin
	def check_link_test(row)
		text = row.body
		#p text
		scriptmode = false
		exp = /(<script|<\/script)\b|\b(href|src|value)(?:\s*=\s*"([^\x22]*)"|=([^\x22\x27> ]+))|\burl\(([^\x29]*)\)/ni
		text.scan( exp ) do |t|
			curr=t.shift.to_s.downcase
			if curr=='<script'
				scriptmode = true
			end
			if curr=='</script'
				scriptmode = false
			end
			
			curr=t.shift.to_s.downcase
			mediacheck = curr=='value'
		#	if !scriptmode then
				puts "#{t[0]} is link" if t[0] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))
				puts "#{t[1]} is link" if t[1] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))
				puts "#{t[2]} is link" if t[2] and ( (not mediacheck) or MEDIA_EXP.match(t[0].to_s))
			#	add_nextpage_sub(t[0], dat)
			#	add_nextpage_sub(t[1], dat)
			#	add_nextpage_sub(t[2], dat)
		#	end
		end
		
	end
=end
	
	
end

