Ruby 2.7.7p221 (2022-11-24 revision 168ec2b1e5ad0e4688e963d9de019557c78feed9)
escape.c
Go to the documentation of this file.
1#include "ruby.h"
2#include "ruby/encoding.h"
3
4RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow);
7#define lower_hexdigits (ruby_hexdigits+0)
8#define upper_hexdigits (ruby_hexdigits+16)
9#define char_to_number(c) ruby_digit36_to_number_table[(unsigned char)(c)]
10
11static VALUE rb_cCGI, rb_mUtil, rb_mEscape;
12static ID id_accept_charset;
13
14#define HTML_ESCAPE_MAX_LEN 6
15
16static const struct {
19} html_escape_table[UCHAR_MAX+1] = {
20#define HTML_ESCAPE(c, str) [c] = {rb_strlen_lit(str), str}
21 HTML_ESCAPE('\'', "'"),
22 HTML_ESCAPE('&', "&"),
23 HTML_ESCAPE('"', """),
24 HTML_ESCAPE('<', "&lt;"),
25 HTML_ESCAPE('>', "&gt;"),
26#undef HTML_ESCAPE
27};
28
29static inline void
30preserve_original_state(VALUE orig, VALUE dest)
31{
32 rb_enc_associate(dest, rb_enc_get(orig));
33}
34
35static VALUE
36optimized_escape_html(VALUE str)
37{
38 VALUE vbuf;
39 typedef char escape_buf[HTML_ESCAPE_MAX_LEN];
40 char *buf = *ALLOCV_N(escape_buf, vbuf, RSTRING_LEN(str));
41 const char *cstr = RSTRING_PTR(str);
42 const char *end = cstr + RSTRING_LEN(str);
43
44 char *dest = buf;
45 while (cstr < end) {
46 const unsigned char c = *cstr++;
47 uint8_t len = html_escape_table[c].len;
48 if (len) {
49 memcpy(dest, html_escape_table[c].str, len);
50 dest += len;
51 }
52 else {
53 *dest++ = c;
54 }
55 }
56
57 VALUE escaped;
58 if (RSTRING_LEN(str) < (dest - buf)) {
59 escaped = rb_str_new(buf, dest - buf);
60 preserve_original_state(str, escaped);
61 }
62 else {
63 escaped = rb_str_dup(str);
64 }
65 ALLOCV_END(vbuf);
66 return escaped;
67}
68
69static VALUE
70optimized_unescape_html(VALUE str)
71{
72 enum {UNICODE_MAX = 0x10ffff};
74 unsigned long charlimit = (strcasecmp(rb_enc_name(enc), "UTF-8") == 0 ? UNICODE_MAX :
75 strcasecmp(rb_enc_name(enc), "ISO-8859-1") == 0 ? 256 :
76 128);
77 long i, len, beg = 0;
78 size_t clen, plen;
79 int overflow;
80 const char *cstr;
81 char buf[6];
82 VALUE dest = 0;
83
85 cstr = RSTRING_PTR(str);
86
87 for (i = 0; i < len; i++) {
88 unsigned long cc;
89 char c = cstr[i];
90 if (c != '&') continue;
91 plen = i - beg;
92 if (++i >= len) break;
93 c = (unsigned char)cstr[i];
94#define MATCH(s) (len - i >= (int)rb_strlen_lit(s) && \
95 memcmp(&cstr[i], s, rb_strlen_lit(s)) == 0 && \
96 (i += rb_strlen_lit(s) - 1, 1))
97 switch (c) {
98 case 'a':
99 ++i;
100 if (MATCH("pos;")) {
101 c = '\'';
102 }
103 else if (MATCH("mp;")) {
104 c = '&';
105 }
106 else continue;
107 break;
108 case 'q':
109 ++i;
110 if (MATCH("uot;")) {
111 c = '"';
112 }
113 else continue;
114 break;
115 case 'g':
116 ++i;
117 if (MATCH("t;")) {
118 c = '>';
119 }
120 else continue;
121 break;
122 case 'l':
123 ++i;
124 if (MATCH("t;")) {
125 c = '<';
126 }
127 else continue;
128 break;
129 case '#':
130 if (len - ++i >= 2 && ISDIGIT(cstr[i])) {
131 cc = ruby_scan_digits(&cstr[i], len-i, 10, &clen, &overflow);
132 }
133 else if ((cstr[i] == 'x' || cstr[i] == 'X') && len - ++i >= 2 && ISXDIGIT(cstr[i])) {
134 cc = ruby_scan_digits(&cstr[i], len-i, 16, &clen, &overflow);
135 }
136 else continue;
137 i += clen;
138 if (overflow || cc >= charlimit || cstr[i] != ';') continue;
139 if (!dest) {
140 dest = rb_str_buf_new(len);
141 }
142 rb_str_cat(dest, cstr + beg, plen);
143 if (charlimit > 256) {
145 }
146 else {
147 c = (unsigned char)cc;
148 rb_str_cat(dest, &c, 1);
149 }
150 beg = i + 1;
151 continue;
152 default:
153 --i;
154 continue;
155 }
156 if (!dest) {
157 dest = rb_str_buf_new(len);
158 }
159 rb_str_cat(dest, cstr + beg, plen);
160 rb_str_cat(dest, &c, 1);
161 beg = i + 1;
162 }
163
164 if (dest) {
165 rb_str_cat(dest, cstr + beg, len - beg);
166 preserve_original_state(str, dest);
167 return dest;
168 }
169 else {
170 return rb_str_dup(str);
171 }
172}
173
/*
 * Return 1 when c is one of the bytes that URL escaping leaves untouched
 * (ASCII letters, digits, '-', '.', '_' and '~'), and 0 for any other byte.
 */
static unsigned char
url_unreserved_char(unsigned char c)
{
    /* One flag per byte value; entries not listed default to 0. */
    static const unsigned char unreserved[UCHAR_MAX + 1] = {
        ['0'] = 1, ['1'] = 1, ['2'] = 1, ['3'] = 1, ['4'] = 1,
        ['5'] = 1, ['6'] = 1, ['7'] = 1, ['8'] = 1, ['9'] = 1,
        ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1, ['e'] = 1,
        ['f'] = 1, ['g'] = 1, ['h'] = 1, ['i'] = 1, ['j'] = 1,
        ['k'] = 1, ['l'] = 1, ['m'] = 1, ['n'] = 1, ['o'] = 1,
        ['p'] = 1, ['q'] = 1, ['r'] = 1, ['s'] = 1, ['t'] = 1,
        ['u'] = 1, ['v'] = 1, ['w'] = 1, ['x'] = 1, ['y'] = 1,
        ['z'] = 1,
        ['A'] = 1, ['B'] = 1, ['C'] = 1, ['D'] = 1, ['E'] = 1,
        ['F'] = 1, ['G'] = 1, ['H'] = 1, ['I'] = 1, ['J'] = 1,
        ['K'] = 1, ['L'] = 1, ['M'] = 1, ['N'] = 1, ['O'] = 1,
        ['P'] = 1, ['Q'] = 1, ['R'] = 1, ['S'] = 1, ['T'] = 1,
        ['U'] = 1, ['V'] = 1, ['W'] = 1, ['X'] = 1, ['Y'] = 1,
        ['Z'] = 1,
        ['-'] = 1, ['.'] = 1, ['_'] = 1, ['~'] = 1,
    };

    return unreserved[c];
}
192
193static VALUE
194optimized_escape(VALUE str)
195{
196 long i, len, beg = 0;
197 VALUE dest = 0;
198 const char *cstr;
199 char buf[4] = {'%'};
200
202 cstr = RSTRING_PTR(str);
203
204 for (i = 0; i < len; ++i) {
205 const unsigned char c = (unsigned char)cstr[i];
206 if (!url_unreserved_char(c)) {
207 if (!dest) {
208 dest = rb_str_buf_new(len);
209 }
210
211 rb_str_cat(dest, cstr + beg, i - beg);
212 beg = i + 1;
213
214 if (c == ' ') {
215 rb_str_cat_cstr(dest, "+");
216 }
217 else {
218 buf[1] = upper_hexdigits[(c >> 4) & 0xf];
219 buf[2] = upper_hexdigits[c & 0xf];
220 rb_str_cat(dest, buf, 3);
221 }
222 }
223 }
224
225 if (dest) {
226 rb_str_cat(dest, cstr + beg, len - beg);
227 preserve_original_state(str, dest);
228 return dest;
229 }
230 else {
231 return rb_str_dup(str);
232 }
233}
234
235static VALUE
236optimized_unescape(VALUE str, VALUE encoding)
237{
238 long i, len, beg = 0;
239 VALUE dest = 0;
240 const char *cstr;
241 rb_encoding *enc = rb_to_encoding(encoding);
242 int cr, origenc, encidx = rb_enc_to_index(enc);
243
245 cstr = RSTRING_PTR(str);
246
247 for (i = 0; i < len; ++i) {
248 char buf[1];
249 const char c = cstr[i];
250 int clen = 0;
251 if (c == '%') {
252 if (i + 3 > len) break;
253 if (!ISXDIGIT(cstr[i+1])) continue;
254 if (!ISXDIGIT(cstr[i+2])) continue;
255 buf[0] = ((char_to_number(cstr[i+1]) << 4)
256 | char_to_number(cstr[i+2]));
257 clen = 2;
258 }
259 else if (c == '+') {
260 buf[0] = ' ';
261 }
262 else {
263 continue;
264 }
265
266 if (!dest) {
267 dest = rb_str_buf_new(len);
268 }
269
270 rb_str_cat(dest, cstr + beg, i - beg);
271 i += clen;
272 beg = i + 1;
273
274 rb_str_cat(dest, buf, 1);
275 }
276
277 if (dest) {
278 rb_str_cat(dest, cstr + beg, len - beg);
279 preserve_original_state(str, dest);
281 }
282 else {
283 dest = rb_str_dup(str);
284 cr = ENC_CODERANGE(str);
285 }
286 origenc = rb_enc_get_index(str);
287 if (origenc != encidx) {
288 rb_enc_associate_index(dest, encidx);
290 rb_enc_associate_index(dest, origenc);
291 if (cr != ENC_CODERANGE_UNKNOWN)
292 ENC_CODERANGE_SET(dest, cr);
293 }
294 }
295 return dest;
296}
297
298/*
299 * call-seq:
300 * CGI.escapeHTML(string) -> string
301 *
302 * Returns HTML-escaped string.
303 *
304 */
305static VALUE
306cgiesc_escape_html(VALUE self, VALUE str)
307{
309
311 return optimized_escape_html(str);
312 }
313 else {
314 return rb_call_super(1, &str);
315 }
316}
317
318/*
319 * call-seq:
320 * CGI.unescapeHTML(string) -> string
321 *
322 * Returns HTML-unescaped string.
323 *
324 */
325static VALUE
326cgiesc_unescape_html(VALUE self, VALUE str)
327{
329
331 return optimized_unescape_html(str);
332 }
333 else {
334 return rb_call_super(1, &str);
335 }
336}
337
338/*
339 * call-seq:
340 * CGI.escape(string) -> string
341 *
342 * Returns URL-escaped string.
343 *
344 */
345static VALUE
346cgiesc_escape(VALUE self, VALUE str)
347{
349
351 return optimized_escape(str);
352 }
353 else {
354 return rb_call_super(1, &str);
355 }
356}
357
358static VALUE
359accept_charset(int argc, VALUE *argv, VALUE self)
360{
361 if (argc > 0)
362 return argv[0];
363 return rb_cvar_get(CLASS_OF(self), id_accept_charset);
364}
365
366/*
367 * call-seq:
368 * CGI.unescape(string, encoding=@@accept_charset) -> string
369 *
370 * Returns URL-unescaped string.
371 *
372 */
373static VALUE
374cgiesc_unescape(int argc, VALUE *argv, VALUE self)
375{
376 VALUE str = (rb_check_arity(argc, 1, 2), argv[0]);
377
379
381 VALUE enc = accept_charset(argc-1, argv+1, self);
382 return optimized_unescape(str, enc);
383 }
384 else {
385 return rb_call_super(argc, argv);
386 }
387}
388
389void
391{
392 id_accept_charset = rb_intern_const("@@accept_charset");
393 InitVM(escape);
394}
395
396void
398{
399 rb_cCGI = rb_define_class("CGI", rb_cObject);
400 rb_mEscape = rb_define_module_under(rb_cCGI, "Escape");
401 rb_mUtil = rb_define_module_under(rb_cCGI, "Util");
402 rb_define_method(rb_mEscape, "escapeHTML", cgiesc_escape_html, 1);
403 rb_define_method(rb_mEscape, "unescapeHTML", cgiesc_unescape_html, 1);
404 rb_define_method(rb_mEscape, "escape", cgiesc_escape, 1);
405 rb_define_method(rb_mEscape, "unescape", cgiesc_unescape, -1);
406 rb_prepend_module(rb_mUtil, rb_mEscape);
407 rb_extend_object(rb_cCGI, rb_mEscape);
408}
int rb_enc_get_index(VALUE obj)
Definition: encoding.c:779
VALUE rb_enc_associate(VALUE obj, rb_encoding *enc)
Definition: encoding.c:866
rb_encoding * rb_enc_get(VALUE obj)
Definition: encoding.c:872
int rb_enc_to_index(rb_encoding *enc)
Definition: encoding.c:125
rb_encoding * rb_to_encoding(VALUE enc)
Definition: encoding.c:245
VALUE rb_enc_associate_index(VALUE obj, int idx)
Definition: encoding.c:838
#define rb_enc_mbcput(c, buf, enc)
Definition: encoding.h:217
#define ENC_CODERANGE_CLEAN_P(cr)
Definition: encoding.h:107
#define rb_enc_str_asciicompat_p(str)
Definition: encoding.h:257
int rb_enc_str_coderange(VALUE)
Definition: string.c:657
#define ENC_CODERANGE(obj)
Definition: encoding.h:108
#define ENC_CODERANGE_UNKNOWN
Definition: encoding.h:103
#define rb_enc_name(enc)
Definition: encoding.h:177
#define ENC_CODERANGE_SET(obj, cr)
Definition: encoding.h:110
#define MATCH(s)
void Init_escape(void)
Definition: escape.c:390
uint8_t len
Definition: escape.c:17
RUBY_EXTERN unsigned long ruby_scan_digits(const char *str, ssize_t len, int base, size_t *retlen, int *overflow)
Definition: util.c:97
RUBY_EXTERN const signed char ruby_digit36_to_number_table[]
Definition: escape.c:6
#define HTML_ESCAPE(c, str)
char str[HTML_ESCAPE_MAX_LEN+1]
Definition: escape.c:18
RUBY_EXTERN const char ruby_hexdigits[]
Definition: escape.c:5
#define upper_hexdigits
Definition: escape.c:8
void InitVM_escape(void)
Definition: escape.c:397
#define HTML_ESCAPE_MAX_LEN
Definition: escape.c:14
#define char_to_number(c)
Definition: escape.c:9
void rb_extend_object(VALUE, VALUE)
Extend the object with the module.
Definition: eval.c:1701
VALUE rb_define_class(const char *, VALUE)
Defines a top-level class.
Definition: class.c:662
void rb_prepend_module(VALUE, VALUE)
Definition: class.c:1017
VALUE rb_define_module_under(VALUE, const char *)
Definition: class.c:810
VALUE rb_cObject
Object class.
Definition: ruby.h:2012
#define UNICODE_MAX
Definition: nkf.c:427
unsigned char buf[MIME_BUF_SIZE]
Definition: nkf.c:4322
unsigned int OnigCodePoint
Definition: onigmo.h:80
use StringValue() instead")))
#define RSTRING_LEN(str)
#define ALLOCV_END(v)
__uint8_t uint8_t
#define RSTRING_PTR(str)
#define UCHAR_MAX
#define ISXDIGIT(c)
#define rb_str_new(str, len)
VALUE rb_str_cat(VALUE, const char *, long)
Definition: string.c:2812
#define rb_intern_const(str)
VALUE rb_str_buf_new(long)
Definition: string.c:1315
uint32_t i
#define char
int strcasecmp(const char *, const char *) __attribute__((__pure__))
#define rb_str_cat_cstr(str, ptr)
#define ALLOCV_N(type, v, n)
VALUE rb_cvar_get(VALUE, ID)
Definition: variable.c:3107
const struct rb_call_cache * cc
#define ISDIGIT(c)
void * memcpy(void *__restrict__, const void *__restrict__, size_t)
const VALUE * argv
_ssize_t ssize_t
#define CLASS_OF(v)
#define rb_check_arity
#define RUBY_EXTERN
VALUE rb_str_dup(VALUE)
Definition: string.c:1516
unsigned long ID
#define InitVM(ext)
void rb_define_method(VALUE, const char *, VALUE(*)(), int)
VALUE rb_call_super(int, const VALUE *)
Definition: vm_eval.c:306
unsigned long VALUE
Definition: ruby.h:102