forked from infochimps-labs/wukong
-
Notifications
You must be signed in to change notification settings - Fork 5
Expand file tree
/
Copy pathsample_records.rb
More file actions
executable file
·33 lines (29 loc) · 928 Bytes
/
sample_records.rb
File metadata and controls
executable file
·33 lines (29 loc) · 928 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
#!/usr/bin/env ruby
require 'rubygems'
require 'wukong/script'
# Declare the mandatory --sampling_fraction command-line setting (a Float).
Settings.define(
  :sampling_fraction,
  :type        => Float,
  :required    => true,
  :description => "floating-point number between 0 and 1 giving the fraction of lines to emit: at sampling_fraction=1 all records are emitted, at 0 none are."
)
#
# Probabilistically emit some fraction of records/lines
#
# Set the sampling fraction at the command line using the
# --sampling_fraction=
# option: for example, to take a random 1/1000th of the lines in huge_files,
# ./examples/sample_records.rb --sampling_fraction=0.001 --run huge_files sampled_files
#
class Mapper < Wukong::Streamer::LineStreamer
  include Wukong::Streamer::Filter

  #
  # Random keep/drop decision for a single line: take one draw from +rand+
  # and report whether it falls below +Settings.sampling_fraction+. The
  # content of +line+ plays no part in the decision.
  #
  # NOTE(review): presumably Wukong::Streamer::Filter consults this predicate
  # to decide which lines are emitted -- confirm against the Filter module.
  #
  # @param line [Object] the incoming line (not inspected)
  # @return [Boolean] result of comparing the draw with the sampling fraction
  #
  def emit?(line)
    draw = rand
    draw < Settings.sampling_fraction
  end
end
#
# Executes the script
#
# Hand Mapper to Wukong.run with nil as the second argument and options
# asking for zero reduce tasks and JVM reuse.
# NOTE(review): nil presumably means "no reducer" (map-only job) -- confirm
# against Wukong.run's signature.
Wukong.run(Mapper, nil, :reduce_tasks => 0, :reuse_jvms => true)