#-*- coding: utf-8; -*-

# 2009-07-24 katoy
#  SEC の CIK コードの一覧を得る。
#
#  ruby 1.8.7 (2009-06-12 patchlevel 174) [i686-darwin9]
#  nokogiri (1.3.2)
#
# 変更履歴:
#

require 'rubygems'
require 'pp'
require 'nokogiri'        # gem install nokogiri
require 'open-uri'
require 'yaml'
require 'pathname'
require 'benchmark'

# Index pages to scrape, one per page-name suffix (the suffixes look like the
# leading character(s) of the company names listed on each page).
# The array is frozen so the constant cannot be modified by accident.
URLS = %w[
  123 a b c d e f g h i j k l m n o p q r s t uv wxyz
].map { |suffix|
  "http://www.sec.gov/divisions/corpfin/organization/cfia-#{suffix}.htm"
}.freeze

# Downloads one SEC index page (HTML) and extracts its company table.
#
# url:: address of the HTML page to fetch.
#
# Returns a Hash keyed by company name (as a Symbol). Each value is a Hash
# with :cik (the CIK as a String, zero-padded to 10 digits) and :sic (the SIC
# code as an Integer). Cells without a leading number parse as 0.
def read_cik(url)
  # URI#open (mixed in by open-uri) is used instead of Kernel#open, which no
  # longer accepts URLs since Ruby 3.0. The block form also closes the
  # response stream as soon as the document has been parsed.
  doc = URI.parse(url).open { |io| Nokogiri::HTML(io) }

  list = {}
  # Path noted for a data cell:
  #   /html/body/table[2]/tbody/tr/td[3]/table/tbody/tr[2]/td
  # The XPath below leaves out the tbody steps.
  doc.xpath('/html/body/table[2]/tr/td[3]//table/tr').each do |row|
    cells = row.xpath('td')
    next unless cells.size == 3 # keep only three-cell rows: name, CIK, SIC

    list[cells[0].text.to_sym] = {
      :cik => format('%010d', cells[1].text.to_i),
      :sic => cells[2].text.to_i
    }
  end
  list
end

# Prints a one-line command-line usage hint for this script to standard output.
def usage
  message = "usage: ruby #{__FILE__} [n]"
  puts message
end

# Script body: scrape every page in URLS, merge the per-page tables, write the
# result to cik.yml, then read the file back as a sanity check. The whole run
# is timed and the Benchmark measurement is printed at the end.
puts Benchmark.measure {
  list = {}
  URLS.each do |url|
    puts "reading... #{url}"
    list.merge!(read_cik(url))
  end

  # Drop table header rows. Their non-numeric CIK/SIC cells both parse to 0.
  # The hash is keyed by company-name symbols, so these rows have to be
  # matched by value (deleting the key 0 would never match anything).
  list.delete_if { |_name, entry| entry[:cik].to_i.zero? && entry[:sic].zero? }
  puts "Data count = #{list.size}"

  # Save the data.
  puts "save to cik.yml"
  File.open('cik.yml', 'w') { |out| YAML.dump(list, out) }

  # Verify that the saved file can be read back.
  # YAML.load_stream replaces YAML.load_documents, which current Psych
  # versions no longer provide; it yields each document in the file.
  File.open('cik.yml') do |io|
    YAML.load_stream(io) { |data| puts "Data count = #{data.size}" }
  end
}

#--- End of File ---
