The Computer Language
Benchmarks Game

k-nucleotide Ruby #7 program

source code

# The Computer Language Benchmarks Game
# http://benchmarksgame.alioth.debian.org
#
# contributed by Aaron Tavistock

# Counts the fragments of @seq for every distinct length found in +keys+.
#
# One worker thread is started per distinct key length; each worker calls
# key_frequency(key_length, @seq) and hands its hash back through
# Thread#value. The per-length hashes are merged on the calling thread only,
# so no Hash is ever mutated from two threads at once (which matters on
# runtimes whose threads execute in parallel).
#
# keys - Array of Strings; only their sizes are used here.
#
# Stores the merged Hash (fragment => count) in @frequencies and returns it.
# An exception raised inside a worker is re-raised here by Thread#value.
def find_frequencies(keys)
  workers = keys.map(&:size).uniq.map do |key_length|
    Thread.new { key_frequency(key_length, @seq) }
  end

  # Thread#value joins the worker before returning its block's result.
  @frequencies = workers.each_with_object({}) do |worker, merged|
    merged.merge!(worker.value)
  end
end

# Runs original_key_frequency(key_length, seq) in a forked child process and
# returns the child's result to the parent.
#
# The child serialises its result with Marshal onto a pipe; the parent reads
# and deserialises it. The data comes from this program's own child, not from
# an external source.
#
# key_length - length of the fragments to count.
# seq        - the sequence String to scan.
#
# Returns whatever original_key_frequency returned in the child.
# Raises whatever Marshal.load raises if the child exits without writing a
# complete result; the child is still reaped in that case.
def forking_key_frequency(key_length, seq)
  reader, writer = IO.pipe

  pid = Process.fork do
    begin
      reader.close
      results = original_key_frequency(key_length, seq)
      Marshal.dump(results, writer)
    ensure
      writer.close
    end
  end

  # The parent must drop its copy of the write end, otherwise the reader
  # would never see end-of-file.
  writer.close
  begin
    Marshal.load(reader)
  ensure
    # Close the read end first so a child still blocked on writing is
    # released, then always reap the child -- also when loading failed.
    reader.close
    Process.waitpid(pid)
  end
end

# Counts every fragment of +key_length+ bytes in +seq+, sliding one byte at
# a time.
#
# key_length - number of bytes per fragment.
# seq        - the String to scan.
#
# Returns a Hash (default value 0) mapping each fragment to its number of
# occurrences. When key_length exceeds seq.size the hash is empty.
def key_frequency(key_length, seq)
  count = Hash.new(0)
  start_index = 0
  # Last offset at which a full fragment still fits. The comparison below is
  # inclusive so the final fragment of the sequence is counted too.
  last_start = seq.size - key_length
  while start_index <= last_start
    count[seq.byteslice(start_index, key_length)] += 1
    start_index += 1
  end
  count
end

# Pairs each key with its count from @frequencies.
#
# keys - Array of fragment Strings to look up.
#
# Returns an Array of [key, count] pairs in the order of +keys+. A key that
# never occurred is reported with a count of 0 rather than nil, so callers
# can do arithmetic on, or print, every count.
def frequency(keys)
  keys.map { |key| [key, @frequencies.fetch(key, 0)] }
end

# Formats the relative frequency of each key, most frequent first.
#
# keys - Array of fragment Strings.
#
# Returns an Array of Strings "KEY pct": the key upper-cased, and the key's
# count times 100 divided by @seq.size, printed with three decimals.
def percentage(keys)
  by_count_desc = frequency(keys).sort { |(_, left), (_, right)| right <=> left }
  by_count_desc.map do |key, value|
    share = (value * 100).to_f / @seq.size
    format('%s %.3f', key.upcase, share)
  end
end

# Formats the absolute count of each key, shortest key first.
#
# keys - Array of fragment Strings.
#
# Returns an Array of Strings "<count>\t<KEY>" with the key upper-cased.
def count(keys)
  shortest_first = frequency(keys).sort_by { |key, _| key.size }
  shortest_first.map { |key, value| "#{value}\t#{key.upcase}" }
end

# Reads all of STDIN and extracts the sequence that follows +marker+.
#
# Everything after the first occurrence of +marker+ up to the end of input
# is kept; newlines and spaces are removed and the result is frozen.
#
# marker - String that precedes the wanted sequence in the input.
#
# Stores the sequence in @seq and returns it.
# Raises ArgumentError when +marker+ does not occur in the input.
def load_sequence(marker)
  input = STDIN.read
  marker_idx = input.index(marker)
  raise ArgumentError, "marker #{marker.inspect} not found in input" unless marker_idx

  @seq = input[(marker_idx + marker.size)..-1]
  @seq.delete!("\n ")
  @seq.freeze
end

# Everywhere except on the 'java' platform, route key_frequency through the
# forking implementation. The in-process version stays reachable under the
# name original_key_frequency, which forking_key_frequency calls in the child.
unless RUBY_PLATFORM == 'java'
  # alias_method is invoked via send so this also works where it is private.
  singleton_class.send(:alias_method, :original_key_frequency, :key_frequency)
  singleton_class.send(:alias_method, :key_frequency, :forking_key_frequency)
end

# Fragments to report: the four single bases, every two-base combination
# (generated in the order aa at ac ag ta ... gg), and the fixed longer chains.
singles = %w[a t c g]
doubles = singles.product(singles).map(&:join)
chains  = %w[ggt ggta ggtatt ggtattttaatt ggtattttaatttatagt]

load_sequence('>THREE Homo sapiens frequency')
find_frequencies(singles + doubles + chains)

# Percentage tables first, each followed by a blank line, then the counts.
[singles, doubles].each do |keys|
  print percentage(keys).join("\n"), "\n\n"
end
print count(chains).join("\n"), "\n"
    

notes, command-line, and program output

NOTES:
64-bit Ubuntu quad core
ruby 2.4.0p0 (2016-12-24 revision 57164) [x86_64-linux]


Sun, 12 Feb 2017 06:00:26 GMT

COMMAND LINE:
/usr/local/src/ruby/bin/ruby -W0 knucleotide.yarv-7.yarv 0 < knucleotide-input25000000.txt

PROGRAM OUTPUT:
A 30.295
T 30.151
C 19.800
G 19.754

AA 9.177
TA 9.132
AT 9.131
TT 9.091
CA 6.002
AC 6.001
AG 5.987
GA 5.984
CT 5.971
TC 5.971
GT 5.957
TG 5.956
CC 3.917
GC 3.911
CG 3.909
GG 3.902

1471758	GGT
446535	GGTA
47336	GGTATT
893	GGTATTTTAATT
893	GGTATTTTAATTTATAGT