Parallelize search using multiple processes

This commit is a proof-of-concept implementation of parallelized search
to take advantage of multiple cores of the system. A multi-threaded
program written in Ruby cannot saturate more than a single core of CPU
because of GIL. So the only way we can use multiple cores is to spawn
multiple processes, split the work, and merge the partial results. The
result of each child process is passed to the parent via pipe in a
serialized form. This serialization and deserialization of the result is
the unavoidable overhead of using multiple processes instead of multiple
threads. Unfortunately, Ruby's default serialization method,
Marshal.dump is found to be so slow that it often shadows the gain
especially when the selectivity of the query is not very good. So in
this implementation I chose to use MessagePack which is at least 5-times
faster than Marshal. However, this will make installing fzf much
trickier for many people. So I'm not sure if I want to merge this code
into the master branch considering that fzf is already pretty fast in
most cases thanks to its caching scheme and the benefit of
parallelization is only noticeable when the list has more than 50k
items.
This commit is contained in:
Junegunn Choi 2014-04-21 02:44:26 +09:00
parent fe22213b51
commit 501af62661

198
fzf
View File

@ -7,7 +7,7 @@
# / __/ / /_/ __/ # / __/ / /_/ __/
# /_/ /___/_/ Fuzzy finder for your shell # /_/ /___/_/ Fuzzy finder for your shell
# #
# Version: 0.8.3 (April 3, 2014) # Version: 0.9.0-alpha (April 21, 2014)
# #
# Author: Junegunn Choi # Author: Junegunn Choi
# URL: https://github.com/junegunn/fzf # URL: https://github.com/junegunn/fzf
@ -39,6 +39,7 @@
require 'thread' require 'thread'
require 'curses' require 'curses'
require 'set' require 'set'
require 'msgpack'
unless String.method_defined? :force_encoding unless String.method_defined? :force_encoding
class String class String
@ -74,6 +75,16 @@ class FZF
end end
end end
# TODO
def cores?
cores = `sysctl -n hw.physicalcpu 2> /dev/null`.chomp
if $?.exitstatus == 0
cores.to_i
else
1
end
end
def initialize argv, source = $stdin def initialize argv, source = $stdin
@rxflag = nil @rxflag = nil
@sort = ENV.fetch('FZF_DEFAULT_SORT', 1000).to_i @sort = ENV.fetch('FZF_DEFAULT_SORT', 1000).to_i
@ -88,6 +99,9 @@ class FZF
@filter = nil @filter = nil
@nth = nil @nth = nil
@delim = nil @delim = nil
@pids = []
@procs = cores?
@parallel_min = 10000
argv = argv =
if opts = ENV['FZF_DEFAULT_OPTS'] if opts = ENV['FZF_DEFAULT_OPTS']
@ -148,6 +162,16 @@ class FZF
@sort = $1.to_i @sort = $1.to_i
when '-e', '--extended-exact' then @extended = :exact when '-e', '--extended-exact' then @extended = :exact
when '+e', '--no-extended-exact' then @extended = nil when '+e', '--no-extended-exact' then @extended = nil
when '-p', '--parallel'
usage 1, 'number of processes required' unless pnum = argv.shift
@procs = pnum.to_i
when /^-p([1-9][0-9]*)$/, /^--parallel=([1-9][0-9]*)$/
@procs = $1.to_i
when '--parallel-min'
usage 1, 'number of processes required' unless pmin = argv.shift
@parallel_min = pmin.to_i
when /^--parallel-min=([1-9][0-9]*)$/
@parallel_min = $1.to_i
else else
usage 1, "illegal option: #{o}" usage 1, "illegal option: #{o}"
end end
@ -223,7 +247,7 @@ class FZF
end end
def filter_list list def filter_list list
matches = matcher.match(list, @filter, '', '') matches = matcher.match(list, @filter)
if @sort && matches.length <= @sort if @sort && matches.length <= @sort
matches = FZF.sort(matches) matches = FZF.sort(matches)
end end
@ -667,6 +691,103 @@ class FZF
end end
end end
def cached lists, q, prefix, suffix
cnt = 0
cached = lists.inject({}) do |sum, l|
cached = matcher.cached(l, q, prefix, suffix)
cnt += cached ? cached.length : l.length
sum[l] = cached
sum
end
[cnt, cached]
end
def search lists, q, cx
cache_count, cached = cached(lists, q, q[0, cx], q[cx..-1])
if @procs <= 1 || lists.empty? || lists.length < @procs || cache_count < @parallel_min
search_sequential lists, q, cached
else
search_parallel lists, q, cached
end
end
def search_sequential lists, q, cached
progress = 0
started_at = Time.now
found = []
skip = false
cnt = 0
lists.each do |list|
cnt += list.length
skip = @mtx.synchronize { @events[:key] }
break if skip
if (progress = 100 * cnt / @count.get) < 100 && Time.now - started_at > 0.5
render { print_info " (#{progress}%)" }
end
matches = matcher.match(list, q, cached[list])
matcher.cache list, q, matches
found.concat matches
end
return :skip if skip
found
end
def search_parallel lists, q, cached
list_map = lists.inject({}) { |h, l| h[l.object_id] = l; h }
slice_size = lists.length / @procs
slices = lists.each_slice(slice_size)
triples = slices.map do |lists|
read, write = IO.pipe
[fork do
read.close
running = true
Signal.trap('USR1') do
running = false
end
matches = {}
lists.each do |list|
break unless running
matches[list.object_id] = matcher.match(list, q, cached[list])
end
write << MessagePack.pack(matches)
exit! running
end, read, write]
end
matches = []
@pids = triples.map { |t| t.first }
mutex = Mutex.new
skip = false
triples.map { |pid, read, write|
Thread.new do
write.close
result = read.read
_, status = Process.wait2(pid)
raise if result.empty?
if status.exitstatus == 0
MessagePack.unpack(result).each do |list_object_id, data|
mutex.synchronize do
matches.concat data
end
matcher.cache list_map[list_object_id], q, data
end
else
skip = true
end
end
}.each(&:join)
skip ? :skip : matches
end
def signal_children
while pid = @pids.pop
Process.kill 'USR1', pid rescue nil
end
end
def start_search &callback def start_search &callback
Thread.new do Thread.new do
lists = [] lists = []
@ -704,39 +825,26 @@ class FZF
new_search = events[:key] || events.delete(:new) new_search = events[:key] || events.delete(:new)
user_input = events[:key] user_input = events[:key]
progress = 0
started_at = Time.now
if updated = new_search && !lists.empty? if updated = new_search && !lists.empty?
q, cx = events.delete(:key) || [q, 0] q, cx = events.delete(:key) || [q, 0]
empty = matcher.empty?(q)
unless matches = fcache[q] unless matches = fcache[q]
found = [] # Simply concats the list
skip = false if matcher.empty?(q)
cnt = 0 matches = lists.inject([]) { |cc, l| cc.concat l }
lists.each do |list| else
cnt += list.length matches ||= search(lists, q, cx)
skip = @mtx.synchronize { @events[:key] } next if matches == :skip
break if skip matches = @sort ? matches : matches.reverse
if @sort && matches.length <= @sort
if !empty && (progress = 100 * cnt / @count.get) < 100 && Time.now - started_at > 0.5
render { print_info " (#{progress}%)" }
end
found.concat(q.empty? ? list :
matcher.match(list, q, q[0, cx], q[cx..-1]))
end
next if skip
matches = @sort ? found : found.reverse
if !empty && @sort && matches.length <= @sort
matches = FZF.sort(matches) matches = FZF.sort(matches)
end end
fcache[q] = matches end
end end
# Atomic update # Atomic update
@matches.set matches @matches.set fcache[q] = matches
end#new_search end
callback = nil if callback && callback = nil if callback &&
(updated || events[:loaded]) && (updated || events[:loaded]) &&
@ -1083,7 +1191,10 @@ class FZF
upd = actions.fetch(key, actions[:default]).call(key) upd = actions.fetch(key, actions[:default]).call(key)
# Dispatch key event # Dispatch key event
emit(:key) { [@query.set(input.dup), cursor] } if upd if upd
signal_children
emit(:key) { [@query.set(input.dup), cursor] }
end
end end
end end
ensure ensure
@ -1176,9 +1287,7 @@ class FZF
end end
end end
def match list, q, prefix, suffix def cached list, q, prefix, suffix
regexp = fuzzy_regex q
cache = @caches[list.object_id] cache = @caches[list.object_id]
prefix_cache = nil prefix_cache = nil
(prefix.length - 1).downto(1) do |len| (prefix.length - 1).downto(1) do |len|
@ -1190,10 +1299,17 @@ class FZF
break if suffix_cache = cache[suffix[idx..-1]] break if suffix_cache = cache[suffix[idx..-1]]
end unless suffix.empty? end unless suffix.empty?
partial_cache = [prefix_cache, [prefix_cache, suffix_cache].compact.sort_by { |e| e.length }.first
suffix_cache].compact.sort_by { |e| e.length }.first end
cache[q] ||= (partial_cache ?
partial_cache.map { |e| e.first } : list).map { |line| def cache list, q, data
@caches[list.object_id][q] = data
end
def match list, q, partial_cache = nil
regexp = fuzzy_regex q
(partial_cache ? partial_cache.map { |e| e.first } : list).map { |line|
# Ignore errors: e.g. invalid byte sequence in UTF-8 # Ignore errors: e.g. invalid byte sequence in UTF-8
md = do_match(line, regexp) md = do_match(line, regexp)
md && [line, [md.offset(0)]] md && [line, [md.offset(0)]]
@ -1249,7 +1365,11 @@ class FZF
Regexp.new(sanitize(Regexp.escape(w)), rxflag_for(w)) Regexp.new(sanitize(Regexp.escape(w)), rxflag_for(w))
end end
def match list, q, prefix, suffix def cache list, q, data
@caches[list.object_id][Set[parse q]] = data
end
def cached list, q, prefix, suffix
regexps = parse q regexps = parse q
# Look for prefix cache # Look for prefix cache
cache = @caches[list.object_id] cache = @caches[list.object_id]
@ -1258,10 +1378,12 @@ class FZF
(prefix.length - 1).downto(1) do |len| (prefix.length - 1).downto(1) do |len|
break if prefix_cache = cache[Set[@regexps[prefix[0, len]]]] break if prefix_cache = cache[Set[@regexps[prefix[0, len]]]]
end end
prefix_cache
end
cache[Set[regexps]] ||= (prefix_cache ? def match list, q, partial_cache = nil
prefix_cache.map { |e| e.first } : regexps = parse q
list).map { |line| (partial_cache ? partial_cache.map { |e| e.first } : list).map { |line|
offsets = [] offsets = []
regexps.all? { |pair| regexps.all? { |pair|
regexp, invert = pair regexp, invert = pair