# A single Unicode character together with its code in one particular encoding.
class UnicodeCharacter
	attr_reader :code_uni, :code_enc, :bytes, :name

	# code_uni - Unicode code point of the character
	# code_enc - code of the character in that particular encoding
	# name     - character name (like 'eacute')
	def initialize(code_uni, code_enc, name)
		@code_uni, @code_enc, @name = code_uni, code_enc, name
		# byte values of the code point's UTF-8 representation
		utf8 = [code_uni].pack('U')
		@bytes = utf8.unpack('C*')
	end
end

# Hash of UnicodeCharacter objects. Depending on which method is used to fill
# it, it is keyed either by the first UTF-8 byte or by the whole code point.
class UnicodeCharacters < Hash
	# Files the character under the first byte of its UTF-8 form, written as a
	# two-digit hex string (like 'c3'). Every key maps to an Array of characters.
	# Returns that Array.
	def add_new_character_first_byte(code_uni, code_enc, name)
		key = [code_uni].pack('U').unpack('H2').first
		self[key] = [] if self[key].nil?
		self[key] << UnicodeCharacter.new(code_uni, code_enc, name)
	end

	# Files the character under its whole Unicode code point, replacing any
	# previous entry. Returns the new UnicodeCharacter.
	def add_new_character(code_uni, code_enc, name)
		store(code_uni, UnicodeCharacter.new(code_uni, code_enc, name))
	end
end

# Character table of one named encoding, plus helpers that rewrite UTF-8
# strings using that table.
#
# The table is read from data/encodings/<encoding_name>.dat (relative to this
# file). The name 'ascii' is special: no file is read and all tables stay empty.
class HyphEncoding
	# One line of the data file:
	#   0x<code in encoding> TAB U+<unicode code> TAB <type> TAB <name>
	DATA_LINE = /^0x(\w+)\tU\+(\w+)\t(\d*)\t([_a-zA-Z0-9\.]*)$/

	attr_reader :encoding_name, :unicode_characters, :unicode_characters_first_byte, :lowercase_characters

	# encoding_name - name of the encoding; also the basename of the data file
	def initialize(encoding_name)
		@encoding_name = encoding_name
		# characters grouped by the first byte of their UTF-8 form
		@unicode_characters_first_byte = UnicodeCharacters.new
		# characters keyed by Unicode code point
		@unicode_characters = UnicodeCharacters.new
		# characters sorted by their code in this encoding
		@lowercase_characters = []

		read_data if encoding_name != 'ascii'
	end

	# Converts a String, or every element of an Array of Strings, with
	# convert_string_to_escaped_characters.
	#
	# An Array is modified in place and the same Array is returned; a String is
	# left untouched and a new String is returned. Any other object is returned
	# unchanged.
	def convert_to_escaped_characters(str)
		case str
		when Array
			str.map! { |item| convert_string_to_escaped_characters(item) }
		when String
			convert_string_to_escaped_characters(str)
		else
			str
		end
	end

	# Rewrites a UTF-8 string character by character:
	#
	# * characters present in the encoding table become "^^xx", where xx is the
	#   code in this encoding as two lowercase hex digits
	# * ASCII characters are copied unchanged
	# * U+2019 (right single quotation mark) becomes "'"
	# * for the 'ec' encoding, U+01FD and U+0301 become "[U+XXXX]" and the whole
	#   result is prefixed with "% " (presumably to comment the line out in the
	#   generated file -- confirm with the callers)
	# * anything else is reported on stdout and dropped from the result
	#
	# Returns the converted String.
	def convert_string_to_escaped_characters(str)
		skip_this_string = false
		pieces = str.unpack('U*').map do |c|
			# check if unicode entry with that number exists
			uc = @unicode_characters[c]
			if uc
				# Always two hex digits: a one-digit escape such as "^^b" would be
				# ambiguous when the following character is itself a hex digit.
				sprintf("^^%02x", uc.code_enc)
			elsif c < 128
				[c].pack('U')
			elsif c == 8217 # ’
				"'"
			elsif (c == 0x01FD || c == 0x0301) && @encoding_name == 'ec'
				skip_this_string = true
				sprintf("[U+%04X]", c)
			else
				puts sprintf("There must be an error: character U+%04X in string '%s' is not ASCII or %s.", c, str, @encoding_name.upcase)
				# nil contributes nothing to the joined result
				nil
			end
		end
		pieces.unshift("% ") if skip_this_string
		pieces.join('')
	end

	private

	# Fills the character tables from the encoding's data file. Only lines that
	# match DATA_LINE and whose type column is 1 are recorded. When the file is
	# missing, two messages are printed and the tables stay empty.
	def read_data
		# fetch the characters
		encoding_data_dir = File.expand_path("../data/encodings", __FILE__)
		filename = "#{encoding_data_dir}/#{@encoding_name}.dat"

		unless File.exist?(filename)
			# TODO: throw an error
			puts "Invalid encoding name '#{@encoding_name}'."
			puts "File '#{filename}' doesn't exist."
			return
		end

		# File.foreach closes the file when the iteration is over
		File.foreach(filename) do |line|
			fields = DATA_LINE.match(line)
			next unless fields
			# an empty type column counts as type 0
			next unless fields[3].to_i == 1

			code_enc = fields[1].hex
			code_uni = fields[2].hex
			name = fields[4]
			@unicode_characters_first_byte.add_new_character_first_byte(code_uni, code_enc, name)
			@unicode_characters.add_new_character(code_uni, code_enc, name)
			@lowercase_characters.push(UnicodeCharacter.new(code_uni, code_enc, name))
		end
		# sort once, after everything has been read
		@lowercase_characters.sort! { |x, y| x.code_enc <=> y.code_enc }
	end
end