Skip to content

Commit 2b9e3ba

Browse files
committed
Implement CGI.url_encode and CGI.url_decode
[Feature #18822] Ruby is somewhat missing an RFC 3986 compliant escape method.
1 parent eaa0cc0 commit 2b9e3ba

3 files changed

Lines changed: 168 additions & 25 deletions

File tree

ext/cgi/escape/escape.c

Lines changed: 62 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -200,7 +200,7 @@ url_unreserved_char(unsigned char c)
200200
}
201201

202202
static VALUE
203-
optimized_escape(VALUE str)
203+
optimized_escape(VALUE str, int plus_escape)
204204
{
205205
long i, len, beg = 0;
206206
VALUE dest = 0;
@@ -220,7 +220,7 @@ optimized_escape(VALUE str)
220220
rb_str_cat(dest, cstr + beg, i - beg);
221221
beg = i + 1;
222222

223-
if (c == ' ') {
223+
if (plus_escape && c == ' ') {
224224
rb_str_cat_cstr(dest, "+");
225225
}
226226
else {
@@ -242,7 +242,7 @@ optimized_escape(VALUE str)
242242
}
243243

244244
static VALUE
245-
optimized_unescape(VALUE str, VALUE encoding)
245+
optimized_unescape(VALUE str, VALUE encoding, int unescape_plus)
246246
{
247247
long i, len, beg = 0;
248248
VALUE dest = 0;
@@ -265,7 +265,7 @@ optimized_unescape(VALUE str, VALUE encoding)
265265
| char_to_number(cstr[i+2]));
266266
clen = 2;
267267
}
268-
else if (c == '+') {
268+
else if (unescape_plus && c == '+') {
269269
buf[0] = ' ';
270270
}
271271
else {
@@ -348,19 +348,19 @@ cgiesc_unescape_html(VALUE self, VALUE str)
348348
* call-seq:
349349
* CGI.escape(string) -> string
350350
*
351-
* Returns URL-escaped string.
351+
* Returns URL-escaped string (+application/x-www-form-urlencoded+).
352352
*
353353
*/
354354
static VALUE
355-
cgiesc_escape(VALUE self, VALUE str)
355+
cgiesc_encode_www_form_component(VALUE self, VALUE str)
356356
{
357357
StringValue(str);
358358

359359
if (rb_enc_str_asciicompat_p(str)) {
360-
return optimized_escape(str);
360+
return optimized_escape(str, 1);
361361
}
362362
else {
363-
return rb_call_super(1, &str);
363+
return rb_call_super(1, &str);
364364
}
365365
}
366366

@@ -376,22 +376,65 @@ accept_charset(int argc, VALUE *argv, VALUE self)
376376
* call-seq:
377377
* CGI.unescape(string, encoding=@@accept_charset) -> string
378378
*
379-
* Returns URL-unescaped string.
379+
* Returns URL-unescaped string (+application/x-www-form-urlencoded+).
380+
*
381+
*/
382+
static VALUE
383+
cgiesc_decode_www_form_component(int argc, VALUE *argv, VALUE self)
384+
{
385+
VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
386+
387+
StringValue(str);
388+
389+
if (rb_enc_str_asciicompat_p(str)) {
390+
VALUE enc = accept_charset(argc-1, argv+1, self);
391+
return optimized_unescape(str, enc, 1);
392+
}
393+
else {
394+
return rb_call_super(argc, argv);
395+
}
396+
}
397+
398+
/*
399+
* call-seq:
400+
* CGI.url_encode(string) -> string
401+
*
402+
* Returns URL-escaped string following RFC 3986.
403+
*
404+
*/
405+
static VALUE
406+
cgiesc_url_encode(VALUE self, VALUE str)
407+
{
408+
StringValue(str);
409+
410+
if (rb_enc_str_asciicompat_p(str)) {
411+
return optimized_escape(str, 0);
412+
}
413+
else {
414+
return rb_call_super(1, &str);
415+
}
416+
}
417+
418+
/*
419+
* call-seq:
420+
* CGI.url_decode(string, encoding=@@accept_charset) -> string
421+
*
422+
* Returns URL-unescaped string following RFC 3986.
380423
*
381424
*/
382425
static VALUE
383-
cgiesc_unescape(int argc, VALUE *argv, VALUE self)
426+
cgiesc_url_decode(int argc, VALUE *argv, VALUE self)
384427
{
385428
VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
386429

387430
StringValue(str);
388431

389432
if (rb_enc_str_asciicompat_p(str)) {
390-
VALUE enc = accept_charset(argc-1, argv+1, self);
391-
return optimized_unescape(str, enc);
433+
VALUE enc = accept_charset(argc-1, argv+1, self);
434+
return optimized_unescape(str, enc, 0);
392435
}
393436
else {
394-
return rb_call_super(argc, argv);
437+
return rb_call_super(argc, argv);
395438
}
396439
}
397440

@@ -414,8 +457,12 @@ InitVM_escape(void)
414457
rb_mUtil = rb_define_module_under(rb_cCGI, "Util");
415458
rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
416459
rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
417-
rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
418-
rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
460+
rb_define_method(rb_mEscape, "url_encode", cgiesc_url_encode, 1);
461+
rb_define_method(rb_mEscape, "url_decode", cgiesc_url_decode, -1);
462+
rb_define_method(rb_mEscape, "encode_www_form_component", cgiesc_encode_www_form_component, 1);
463+
rb_define_alias(rb_mEscape, "escape", "encode_www_form_component");
464+
rb_define_method(rb_mEscape, "decode_www_form_component", cgiesc_decode_www_form_component, -1);
465+
rb_define_alias(rb_mEscape, "unescape", "decode_www_form_component");
419466
rb_prepend_module(rb_mUtil, rb_mEscape);
420467
rb_extend_object(rb_cCGI, rb_mEscape);
421468
}

lib/cgi/util.rb

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -5,24 +5,59 @@ module Util; end
55
extend Util
66
end
77
module CGI::Util
8-
@@accept_charset="UTF-8" unless defined?(@@accept_charset)
9-
# URL-encode a string.
8+
@@accept_charset = Encoding::UTF_8 unless defined?(@@accept_charset)
9+
10+
# URL-encode a string into application/x-www-form-urlencoded.
11+
# Space characters (+" "+) are encoded with plus signs (+"+"+).
1012
# url_encoded_string = CGI.escape("'Stop!' said Fred")
1113
# # => "%27Stop%21%27+said+Fred"
12-
def escape(string)
14+
def encode_www_form_component(string)
1315
encoding = string.encoding
14-
string.b.gsub(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
16+
buffer = string.b
17+
buffer.gsub!(/([^ a-zA-Z0-9_.\-~]+)/) do |m|
1518
'%' + m.unpack('H2' * m.bytesize).join('%').upcase
16-
end.tr(' ', '+').force_encoding(encoding)
19+
end
20+
buffer.tr!(' ', '+')
21+
buffer.force_encoding(encoding)
1722
end
23+
alias_method :escape, :encode_www_form_component
1824

19-
# URL-decode a string with encoding(optional).
25+
# URL-decode an application/x-www-form-urlencoded string with encoding(optional).
2026
# string = CGI.unescape("%27Stop%21%27+said+Fred")
2127
# # => "'Stop!' said Fred"
22-
def unescape(string,encoding=@@accept_charset)
23-
str=string.tr('+', ' ').b.gsub(/((?:%[0-9a-fA-F]{2})+)/) do |m|
28+
def decode_www_form_component(string, encoding = @@accept_charset)
29+
str = string.tr('+', ' ')
30+
str = str.b
31+
str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
32+
[m.delete('%')].pack('H*')
33+
end
34+
str.force_encoding(encoding)
35+
str.valid_encoding? ? str : str.force_encoding(string.encoding)
36+
end
37+
alias_method :unescape, :decode_www_form_component
38+
39+
# URL-encode a string following RFC 3986
40+
# Space characters (+" "+) are encoded with (+"%20"+).
41+
# url_encoded_string = CGI.url_encode("'Stop!' said Fred")
42+
# # => "%27Stop%21%27%20said%20Fred"
43+
def url_encode(string)
44+
encoding = string.encoding
45+
buffer = string.b
46+
buffer.gsub!(/([^a-zA-Z0-9_.\-~]+)/) do |m|
47+
'%' + m.unpack('H2' * m.bytesize).join('%').upcase
48+
end
49+
buffer.force_encoding(encoding)
50+
end
51+
52+
# URL-decode a string following RFC 3986 with encoding(optional).
53+
# string = CGI.url_decode("%27Stop%21%27+said%20Fred")
54+
# # => "'Stop!'+said Fred"
55+
def url_decode(string, encoding = @@accept_charset)
56+
str = string.b
57+
str.gsub!(/((?:%[0-9a-fA-F]{2})+)/) do |m|
2458
[m.delete('%')].pack('H*')
25-
end.force_encoding(encoding)
59+
end
60+
str.force_encoding(encoding)
2661
str.valid_encoding? ? str : str.force_encoding(string.encoding)
2762
end
2863

test/cgi/test_cgi_util.rb

Lines changed: 62 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,12 +23,16 @@ def teardown
2323
ENV.update(@environ)
2424
end
2525

26-
2726
def test_cgi_escape
2827
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.escape(@str1))
2928
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.escape(@str1).ascii_only?) if defined?(::Encoding)
3029
end
3130

31+
def test_cgi_encode_www_form_component
32+
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.encode_www_form_component(@str1))
33+
assert_equal('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.encode_www_form_component(@str1).ascii_only?) if defined?(::Encoding)
34+
end
35+
3236
def test_cgi_escape_with_unreserved_characters
3337
assert_equal("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~",
3438
CGI.escape("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"),
@@ -54,6 +58,15 @@ def test_cgi_unescape
5458
assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.unescape("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2"))
5559
end
5660

61+
def test_decode_www_form_component
62+
str = CGI.decode_www_form_component('%26%3C%3E%22+%E3%82%86%E3%82%93%E3%82%86%E3%82%93')
63+
assert_equal(@str1, str)
64+
return unless defined?(::Encoding)
65+
66+
assert_equal(@str1.encoding, str.encoding)
67+
assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.decode_www_form_component("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2"))
68+
end
69+
5770
def test_cgi_unescape_preserve_encoding
5871
assert_equal(Encoding::US_ASCII, CGI.unescape("%C0%3C%3C".dup.force_encoding("US-ASCII")).encoding)
5972
assert_equal(Encoding::ASCII_8BIT, CGI.unescape("%C0%3C%3C".dup.force_encoding("ASCII-8BIT")).encoding)
@@ -70,6 +83,54 @@ def test_cgi_unescape_accept_charset
7083
end;
7184
end
7285

86+
def test_cgi_url_encode
87+
assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93', CGI.url_encode(@str1))
88+
assert_equal('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93'.ascii_only?, CGI.url_encode(@str1).ascii_only?) if defined?(::Encoding)
89+
end
90+
91+
def test_cgi_url_encode_with_unreserved_characters
92+
assert_equal("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~",
93+
CGI.url_encode("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-._~"),
94+
"should not url_encode any unreserved characters, as per RFC3986 Section 2.3")
95+
end
96+
97+
def test_cgi_url_encode_with_invalid_byte_sequence
98+
assert_equal('%C0%3C%3C', CGI.url_encode("\xC0\<\<".dup.force_encoding("UTF-8")))
99+
end
100+
101+
def test_cgi_url_encode_preserve_encoding
102+
assert_equal(Encoding::US_ASCII, CGI.url_encode("\xC0\<\<".dup.force_encoding("US-ASCII")).encoding)
103+
assert_equal(Encoding::ASCII_8BIT, CGI.url_encode("\xC0\<\<".dup.force_encoding("ASCII-8BIT")).encoding)
104+
assert_equal(Encoding::UTF_8, CGI.url_encode("\xC0\<\<".dup.force_encoding("UTF-8")).encoding)
105+
end
106+
107+
def test_cgi_url_decode
108+
str = CGI.url_decode('%26%3C%3E%22%20%E3%82%86%E3%82%93%E3%82%86%E3%82%93')
109+
assert_equal(@str1, str)
110+
return unless defined?(::Encoding)
111+
112+
assert_equal("foo+bar", CGI.url_decode("foo+bar"))
113+
114+
assert_equal(@str1.encoding, str.encoding)
115+
assert_equal("\u{30E1 30E2 30EA 691C 7D22}", CGI.url_decode("\u{30E1 30E2 30EA}%E6%A4%9C%E7%B4%A2"))
116+
end
117+
118+
def test_cgi_url_decode_preserve_encoding
119+
assert_equal(Encoding::US_ASCII, CGI.url_decode("%C0%3C%3C".dup.force_encoding("US-ASCII")).encoding)
120+
assert_equal(Encoding::ASCII_8BIT, CGI.url_decode("%C0%3C%3C".dup.force_encoding("ASCII-8BIT")).encoding)
121+
assert_equal(Encoding::UTF_8, CGI.url_decode("%C0%3C%3C".dup.force_encoding("UTF-8")).encoding)
122+
end
123+
124+
def test_cgi_url_decode_accept_charset
125+
return unless defined?(::Encoding)
126+
127+
assert_raise(TypeError) {CGI.url_decode('', nil)}
128+
assert_separately(%w[-rcgi/util], "#{<<-"begin;"}\n#{<<-"end;"}")
129+
begin;
130+
assert_equal("", CGI.url_decode(''))
131+
end;
132+
end
133+
73134
def test_cgi_pretty
74135
assert_equal("<HTML>\n <BODY>\n </BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>"))
75136
assert_equal("<HTML>\n\t<BODY>\n\t</BODY>\n</HTML>\n",CGI.pretty("<HTML><BODY></BODY></HTML>","\t"))

0 commit comments

Comments
 (0)