#-*- coding: utf-8; -*-

# 2009-07-24 katoy
#  Fetch the list of SEC SIC (Standard Industrial Classification) codes.
#
#  ruby 1.8.7 (2009-06-12 patchlevel 174) [i686-darwin9]
#  nokogiri (1.3.2)
#
# Change history:
#

require 'rubygems'
require 'pp'
require 'nokogiri'        # gem install nokogiri
require 'open-uri'
require 'yaml'
require 'json'            # gem install json
require 'pathname'
require 'benchmark'

# Pages to scrape for SIC codes. Each entry is handed to read_sic and the
# per-page results are merged into a single table.
# Frozen so the list cannot be modified by accident at runtime.
URLS = [
  'http://www.sec.gov/info/edgar/siccodes.htm',
].freeze

# Download the SIC code table found at +url+ and return it as a Hash.
#
# url:: address of the HTML page to fetch (opened through open-uri).
#
# Returns a Hash of the form
#   { sic (Integer) => { :office => Integer, :industry => Symbol } }
# with one entry per table row that has exactly four <td> cells. Cell text is
# converted with String#to_i, so a row whose first cell is not numeric (such
# as a heading row) ends up under key 0.
def read_sic(url)
  # Since Ruby 3.0, Kernel#open no longer opens URLs; open-uri exposes that
  # as the public URI.open instead. Older open-uri versions only patch the
  # private Kernel#open (for which respond_to? is false), so they take the
  # fallback branch. The block form closes the handle once parsing is done.
  doc = if URI.respond_to?(:open)
          URI.open(url) { |io| Nokogiri::HTML(io) }
        else
          open(url) { |io| Nokogiri::HTML(io) }
        end

  list = {}
  # Browser-reported location of a data cell, for reference:
  # /html/body/table[2]/tbody/tr/td[3]/p[4]/font/table/tbody/tr[412]/td[4]
  doc.xpath('/html/body/table[2]/tr/td[3]//table/tr').each do |row|
    cells = row.xpath('td')
    next unless cells.size == 4 # rows with any other cell count are skipped

    sic = cells[0].text.to_i
    # NOTE(review): the industry title is stored as a Symbol; kept as-is
    # because the dumped YAML/JSON output depends on this representation.
    list[sic] = { :office => cells[1].text.to_i, :industry => cells[3].text.to_sym }
  end
  list
end

# Print a one-line usage hint for this script to standard output.
# NOTE(review): nothing in the visible code calls this helper or reads the
# advertised [n] argument -- confirm whether it is still needed.
def usage
  message = "usage: ruby #{__FILE__} [n]"
  $stdout.puts message
end

# Script body: fetch every page listed in URLS, merge the parsed tables, then
# write the result to sic.yml and sic.json in the current directory. Each file
# is read back right after it is written to confirm the entry count. The
# timing reported by Benchmark.measure for the whole run is printed last.
puts Benchmark.measure {
  list = {}
  URLS.each do |url|
    puts "reading... #{url}"
    list.merge!(read_sic(url))
  end

  # Key 0 holds the column-heading row rather than a real SIC code; drop it.
  list.delete(0)

  puts "Data count = #{list.size}"

  # yaml
  # ======================
  # Save the data.
  puts "save to sic.yml"
  File.open("sic.yml", "w") do |w|
    YAML.dump(list, w)
  end

  # Read the file back to confirm it.
  # YAML.load_documents is only available in older YAML engines; current
  # Psych offers load_stream, which yields each document the same way.
  File.open('sic.yml') do |io|
    report = lambda { |data| puts "Data count = #{data.size}" }
    if YAML.respond_to?(:load_documents)
      YAML.load_documents(io, &report)
    else
      YAML.load_stream(io, &report)
    end
  end

  # json
  # ======================
  # Save the data as { "totalCount" => n, "data" => [rows sorted by sic] }.
  puts "save to sic.json"
  File.open("sic.json", "w") do |w|
    rows = list.map do |sic, val|
      { "sic" => sic, "industry" => val[:industry], "office" => val[:office] }
    end
    rows = rows.sort_by { |row| row["sic"].to_i }

    w.write JSON.pretty_generate("totalCount" => rows.size, "data" => rows)
  end

  # Read the file back to confirm it.
  File.open('sic.json') do |io|
    data = JSON.parse(io.read)
    puts "Data count = #{data["data"].size}"
  end
}

#--- End of File ---
