-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathsamplen.py
More file actions
31 lines (27 loc) · 877 Bytes
/
samplen.py
File metadata and controls
31 lines (27 loc) · 877 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
#!/usr/bin/python
#
# The reservoir sampling algorithm outputs a uniform random sample of N
# lines from a file of undetermined size. It does so in a single pass,
# using memory proportional to N. These two features -- (i) a memory
# footprint independent of the input size and (ii) a capacity to operate
# on files of indeterminate size -- make it ideal for working with the
# very large data sets common to event processing.
#
import sys
import random
def reservoir_sample(lines, n):
    """Return a uniform random sample of at most ``n`` items from ``lines``.

    ``lines`` may be any iterable (an open file, ``sys.stdin``, a list...).
    It is consumed exactly once and never held in memory as a whole; only
    the reservoir of up to ``n`` items is kept.

    If the iterable yields fewer than ``n`` items, all of them are returned
    in their original order. If ``n`` is zero or negative the result is an
    empty list (the iterable is still consumed).
    """
    reservoir = []
    for index, line in enumerate(lines):
        if index < n:
            # Fill phase: the first n items always enter the reservoir.
            reservoir.append(line)
            continue
        # Replacement phase: draw a slot uniformly from [0, index]
        # (randint is inclusive on both ends). The slot lands inside the
        # reservoir with probability n / (index + 1), and when it does it
        # is uniform over the n reservoir positions -- the same distribution
        # as an accept test followed by a second draw for the victim slot.
        slot = random.randint(0, index)
        if slot < n:
            reservoir[slot] = line
    return reservoir


def main(argv):
    """Command-line entry point: ``samplen.py <lines> [file]``.

    ``argv[1]`` is the sample size; ``argv[2]``, when present, names the
    input file, otherwise standard input is read. The sampled lines are
    written to standard output. Exits with the usage message when the
    argument count is wrong or the sample size is not an integer.
    """
    usage = "Usage: python samplen.py <lines> <?file>"
    if len(argv) not in (2, 3):
        sys.exit(usage)
    try:
        count = int(argv[1])
    except ValueError:
        # A non-numeric <lines> argument is a usage error, not a crash.
        sys.exit(usage)

    if len(argv) == 3:
        # The context manager guarantees the file is closed even if
        # reading fails part-way through.
        with open(argv[2], 'r') as source:
            sample = reservoir_sample(source, count)
    else:
        sample = reservoir_sample(sys.stdin, count)

    sys.stdout.writelines(sample)


if __name__ == "__main__":
    main(sys.argv)