Ruby 3.3.0p0 (2023-12-25 revision 5124f9ac7513eb590c37717337c430cb93caa151)
string.c
1/**********************************************************************
2
3 string.c -
4
5 $Author$
6 created at: Mon Aug 9 17:12:58 JST 1993
7
8 Copyright (C) 1993-2007 Yukihiro Matsumoto
9 Copyright (C) 2000 Network Applied Communication Laboratory, Inc.
10 Copyright (C) 2000 Information-technology Promotion Agency, Japan
11
12**********************************************************************/
13
14#include "ruby/internal/config.h"
15
16#include <ctype.h>
17#include <errno.h>
18#include <math.h>
19
20#ifdef HAVE_UNISTD_H
21# include <unistd.h>
22#endif
23
24#include "debug_counter.h"
25#include "encindex.h"
26#include "id.h"
27#include "internal.h"
28#include "internal/array.h"
29#include "internal/compar.h"
30#include "internal/compilers.h"
31#include "internal/encoding.h"
32#include "internal/error.h"
33#include "internal/gc.h"
34#include "internal/numeric.h"
35#include "internal/object.h"
36#include "internal/proc.h"
37#include "internal/re.h"
38#include "internal/sanitizers.h"
39#include "internal/string.h"
40#include "internal/transcode.h"
41#include "probes.h"
42#include "ruby/encoding.h"
43#include "ruby/re.h"
44#include "ruby/util.h"
45#include "ruby_assert.h"
46#include "vm_sync.h"
47
48#if defined HAVE_CRYPT_R
49# if defined HAVE_CRYPT_H
50# include <crypt.h>
51# endif
52#elif !defined HAVE_CRYPT
53# include "missing/crypt.h"
54# define HAVE_CRYPT_R 1
55#endif
56
/* Shorthand for entry `no` of the beg[] / end[] arrays of `regs`.
 * A variable named `regs` must be in scope wherever these are used. */
#define BEG(no) (regs->beg[(no)])
#define END(no) (regs->end[(no)])
59
60#undef rb_str_new
61#undef rb_usascii_str_new
62#undef rb_utf8_str_new
63#undef rb_enc_str_new
64#undef rb_str_new_cstr
65#undef rb_usascii_str_new_cstr
66#undef rb_utf8_str_new_cstr
67#undef rb_enc_str_new_cstr
68#undef rb_external_str_new_cstr
69#undef rb_locale_str_new_cstr
70#undef rb_str_dup_frozen
71#undef rb_str_buf_new_cstr
72#undef rb_str_buf_cat
73#undef rb_str_buf_cat2
74#undef rb_str_cat2
75#undef rb_str_cat_cstr
76#undef rb_fstring_cstr
77
80
81/* FLAGS of RString
82 *
83 * 1: RSTRING_NOEMBED
84 * 2: STR_SHARED (== ELTS_SHARED)
85 * 5: STR_SHARED_ROOT (RSTRING_NOEMBED==1 && STR_SHARED == 0, there may be
86 * other strings that rely on this string's buffer)
87 * 6: STR_BORROWED (when RSTRING_NOEMBED==1 && klass==0, unsafe to recycle
88 * early, specific to rb_str_tmp_frozen_{acquire,release})
89 * 7: STR_TMPLOCK (set when a pointer to the buffer is passed to syscall
90 * such as read(2). Any modification and realloc is prohibited)
91 *
92 * 8-9: ENC_CODERANGE (2 bits)
93 * 10-16: ENCODING (7 bits == 128)
94 * 17: RSTRING_FSTR
95 * 18: STR_NOFREE (do not free this string's buffer when a String is freed.
96 * used for a string object based on C string literal)
97 * 19: STR_FAKESTR (when RVALUE is not managed by GC. Typically, the string
98 * object header is temporarily allocated on C stack)
99 */
100
/* NOTE(review): presumably an upper bound on the byte length of one
 * character in any supported encoding -- confirm against users. */
#define RUBY_MAX_CHAR_LEN 16

/* String-specific user flag bits; their meaning is described in the
 * "FLAGS of RString" table. */
#define STR_SHARED_ROOT FL_USER5   /* other strings may rely on this buffer */
#define STR_BORROWED    FL_USER6   /* unsafe to recycle early */
#define STR_TMPLOCK     FL_USER7   /* modification and realloc prohibited */
#define STR_NOFREE      FL_USER18  /* buffer is not freed with the String */
#define STR_FAKESTR     FL_USER19  /* header is not managed by the GC */
107
/* Flag `str` as heap-backed and clear all sharing-related flags
 * (STR_SHARED, STR_SHARED_ROOT, STR_BORROWED). */
#define STR_SET_NOEMBED(str) do {\
    FL_SET((str), STR_NOEMBED);\
    FL_UNSET((str), STR_SHARED | STR_SHARED_ROOT | STR_BORROWED);\
} while (0)

/* Flag `str` as embedded; STR_NOFREE is cleared together with
 * STR_NOEMBED. */
#define STR_SET_EMBED(str) FL_UNSET((str), (STR_NOEMBED|STR_NOFREE))

/* Store `n` in the length field of `str`.  Nothing else (terminator,
 * coderange) is updated. */
#define STR_SET_LEN(str, n) do { \
    RSTRING(str)->len = (n); \
} while (0)
117
118static inline bool
119str_enc_fastpath(VALUE str)
120{
121 // The overwhelming majority of strings are in one of these 3 encodings.
122 switch (ENCODING_GET_INLINED(str)) {
123 case ENCINDEX_ASCII_8BIT:
124 case ENCINDEX_UTF_8:
125 case ENCINDEX_US_ASCII:
126 return true;
127 default:
128 return false;
129 }
130}
131
/* Terminator width for `str`: 1 for the fast-path encodings, otherwise
 * the minimum character length of the string's encoding. */
#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str))))

/* Write a terminator at `ptr`: always at least one NUL byte, and
 * `termlen` zero bytes when termlen > 1.  Both arguments are evaluated
 * exactly once. */
#define TERM_FILL(ptr, termlen) do {\
    char *const term_fill_ptr = (ptr);\
    const int term_fill_len = (termlen);\
    if (UNLIKELY(term_fill_len > 1))\
        memset(term_fill_ptr, 0, term_fill_len);\
    else\
        *term_fill_ptr = '\0';\
} while (0)
140
/*
 * Resize the buffer of `str` to `capacity` bytes plus the terminator
 * width of its current encoding.  See RESIZE_CAPA_TERM.
 */
#define RESIZE_CAPA(str,capacity) do {\
    const int termlen = TERM_LEN(str);\
    RESIZE_CAPA_TERM(str,capacity,termlen);\
} while (0)

/*
 * Resize the buffer of `str` to `capacity` + `termlen` bytes.
 *
 * - Embedded string: left untouched when the embedded area is already
 *   large enough; otherwise the contents are copied to a freshly
 *   allocated heap buffer and the string becomes non-embedded.
 * - Heap string: the buffer is reallocated in place; the string must not
 *   be shared (asserted).
 *
 * Macro arguments are parenthesized at every use so that expression
 * arguments (e.g. `a ? b : c`) compare and add as intended.
 */
#define RESIZE_CAPA_TERM(str,capacity,termlen) do {\
    if (STR_EMBED_P(str)) {\
        if (str_embed_capa(str) < (capacity) + (termlen)) {\
            char *const tmp = ALLOC_N(char, (size_t)(capacity) + (termlen));\
            const long tlen = RSTRING_LEN(str);\
            memcpy(tmp, RSTRING_PTR(str), tlen);\
            RSTRING(str)->as.heap.ptr = tmp;\
            RSTRING(str)->len = tlen;\
            STR_SET_NOEMBED(str);\
            RSTRING(str)->as.heap.aux.capa = (capacity);\
        }\
    }\
    else {\
        assert(!FL_TEST((str), STR_SHARED)); \
        SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char, \
                        (size_t)(capacity) + (termlen), STR_HEAP_SIZE(str)); \
        RSTRING(str)->as.heap.aux.capa = (capacity);\
    }\
} while (0)
164
/*
 * Make `str` a sharer of `shared_str`'s buffer: records `shared_str` in
 * as.heap.aux.shared through the write barrier, sets STR_SHARED on `str`
 * and STR_SHARED_ROOT on `shared_str`.  A root without a class is also
 * flagged STR_BORROWED.  Does nothing for fake strings (STR_FAKESTR).
 * The pointer of `str` must already lie within the root's buffer
 * (asserted).
 */
#define STR_SET_SHARED(str, shared_str) do { \
    if (!FL_TEST(str, STR_FAKESTR)) { \
        assert(RSTRING_PTR(shared_str) <= RSTRING_PTR(str)); \
        assert(RSTRING_PTR(str) <= RSTRING_PTR(shared_str) + RSTRING_LEN(shared_str)); \
        RB_OBJ_WRITE((str), &RSTRING(str)->as.heap.aux.shared, (shared_str)); \
        FL_SET((str), STR_SHARED); \
        FL_SET((shared_str), STR_SHARED_ROOT); \
        if (RBASIC_CLASS((shared_str)) == 0) /* for CoW-friendliness */ \
            FL_SET_RAW((shared_str), STR_BORROWED); \
    } \
} while (0)
176
/* Heap buffer pointer of a non-embedded string. */
#define STR_HEAP_PTR(str)  (RSTRING(str)->as.heap.ptr)
/* Size of the heap buffer in bytes: recorded capacity plus terminator. */
#define STR_HEAP_SIZE(str) ((size_t)RSTRING(str)->as.heap.aux.capa + TERM_LEN(str))
/* TODO: include the terminator size in capa. */

/* Encoding object of `str`. */
#define STR_ENC_GET(str) get_encoding(str)

/* SHARABLE_SUBSTRING_P(beg, len, end): by default only a substring that
 * reaches `end` (beg + len == end) is sharable; building with
 * SHARABLE_MIDDLE_SUBSTRING set to non-zero makes every substring
 * sharable. */
#if !defined SHARABLE_MIDDLE_SUBSTRING
# define SHARABLE_MIDDLE_SUBSTRING 0
#endif
#if !SHARABLE_MIDDLE_SUBSTRING
#define SHARABLE_SUBSTRING_P(beg, len, end) ((beg) + (len) == (end))
#else
#define SHARABLE_SUBSTRING_P(beg, len, end) 1
#endif
191
192
193static inline long
194str_embed_capa(VALUE str)
195{
196 return rb_gc_obj_slot_size(str) - offsetof(struct RString, as.embed.ary);
197}
198
199bool
200rb_str_reembeddable_p(VALUE str)
201{
202 return !FL_TEST(str, STR_NOFREE|STR_SHARED_ROOT|STR_SHARED);
203}
204
205static inline size_t
206rb_str_embed_size(long capa)
207{
208 return offsetof(struct RString, as.embed.ary) + capa;
209}
210
211size_t
212rb_str_size_as_embedded(VALUE str)
213{
214 size_t real_size;
215 if (STR_EMBED_P(str)) {
216 real_size = rb_str_embed_size(RSTRING(str)->len) + TERM_LEN(str);
217 }
218 /* if the string is not currently embedded, but it can be embedded, how
219 * much space would it require */
220 else if (rb_str_reembeddable_p(str)) {
221 real_size = rb_str_embed_size(RSTRING(str)->as.heap.aux.capa) + TERM_LEN(str);
222 }
223 else {
224 real_size = sizeof(struct RString);
225 }
226 return real_size;
227}
228
/*
 * Returns whether the GC can allocate an object large enough to embed
 * `len` content bytes plus a `termlen`-byte terminator.
 */
static inline bool
STR_EMBEDDABLE_P(long len, long termlen)
{
    const size_t needed = rb_str_embed_size(len + termlen);

    return rb_gc_size_allocatable_p(needed);
}
234
235static VALUE str_replace_shared_without_enc(VALUE str2, VALUE str);
236static VALUE str_new_frozen(VALUE klass, VALUE orig);
237static VALUE str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding);
238static VALUE str_new_static(VALUE klass, const char *ptr, long len, int encindex);
239static VALUE str_new(VALUE klass, const char *ptr, long len);
240static void str_make_independent_expand(VALUE str, long len, long expand, const int termlen);
241static inline void str_modifiable(VALUE str);
242static VALUE rb_str_downcase(int argc, VALUE *argv, VALUE str);
243
244static inline void
245str_make_independent(VALUE str)
246{
247 long len = RSTRING_LEN(str);
248 int termlen = TERM_LEN(str);
249 str_make_independent_expand((str), len, 0L, termlen);
250}
251
252static inline int str_dependent_p(VALUE str);
253
254void
255rb_str_make_independent(VALUE str)
256{
257 if (str_dependent_p(str)) {
258 str_make_independent(str);
259 }
260}
261
262void
263rb_str_make_embedded(VALUE str)
264{
265 RUBY_ASSERT(rb_str_reembeddable_p(str));
266 RUBY_ASSERT(!STR_EMBED_P(str));
267
268 char *buf = RSTRING(str)->as.heap.ptr;
269 long len = RSTRING(str)->len;
270
271 STR_SET_EMBED(str);
272 STR_SET_LEN(str, len);
273
274 if (len > 0) {
275 memcpy(RSTRING_PTR(str), buf, len);
276 ruby_xfree(buf);
277 }
278
279 TERM_FILL(RSTRING(str)->as.embed.ary + len, TERM_LEN(str));
280}
281
/*
 * Prints a diagnostic to stderr saying that `func` is about to return
 * NULL and that a crash is expected to follow.  Only reports; it does
 * not abort.
 */
void
rb_debug_rstring_null_ptr(const char *func)
{
    fprintf(stderr,
            "%s is returning NULL!! "
            "SIGSEGV is highly expected to follow immediately.\n"
            "If you could reproduce, attach your debugger here, "
            "and look at the passed string.\n",
            func);
}
291
292/* symbols for [up|down|swap]case/capitalize options */
293static VALUE sym_ascii, sym_turkic, sym_lithuanian, sym_fold;
294
295static rb_encoding *
296get_encoding(VALUE str)
297{
298 return rb_enc_from_index(ENCODING_GET(str));
299}
300
301static void
302mustnot_broken(VALUE str)
303{
304 if (is_broken_string(str)) {
305 rb_raise(rb_eArgError, "invalid byte sequence in %s", rb_enc_name(STR_ENC_GET(str)));
306 }
307}
308
309static void
310mustnot_wchar(VALUE str)
311{
312 rb_encoding *enc = STR_ENC_GET(str);
313 if (rb_enc_mbminlen(enc) > 1) {
314 rb_raise(rb_eArgError, "wide char encoding: %s", rb_enc_name(enc));
315 }
316}
317
318static int fstring_cmp(VALUE a, VALUE b);
319
320static VALUE register_fstring(VALUE str, bool copy);
321
322const struct st_hash_type rb_fstring_hash_type = {
323 fstring_cmp,
325};
326
/* True when `str` has no FL_EXIVAR flag and its class is exactly
 * rb_cString. */
#define BARE_STRING_P(str) (!FL_ANY_RAW(str, FL_EXIVAR) && RBASIC_CLASS(str) == rb_cString)
328
330 VALUE fstr;
331 bool copy;
332};
333
334static int
335fstr_update_callback(st_data_t *key, st_data_t *value, st_data_t data, int existing)
336{
337
338 struct fstr_update_arg *arg = (struct fstr_update_arg *)data;
339 VALUE str = (VALUE)*key;
340
341 if (existing) {
342 /* because of lazy sweep, str may be unmarked already and swept
343 * at next time */
344
345 if (rb_objspace_garbage_object_p(str)) {
346 arg->fstr = Qundef;
347 return ST_DELETE;
348 }
349
350 arg->fstr = str;
351 return ST_STOP;
352 }
353 else {
354 if (FL_TEST_RAW(str, STR_FAKESTR)) {
355 if (arg->copy) {
356 VALUE new_str = str_new(rb_cString, RSTRING(str)->as.heap.ptr, RSTRING(str)->len);
357 rb_enc_copy(new_str, str);
358 str = new_str;
359 }
360 else {
361 str = str_new_static(rb_cString, RSTRING(str)->as.heap.ptr,
362 RSTRING(str)->len,
363 ENCODING_GET(str));
364 }
365 OBJ_FREEZE_RAW(str);
366 }
367 else {
368 if (!OBJ_FROZEN(str))
369 str = str_new_frozen(rb_cString, str);
370 if (STR_SHARED_P(str)) { /* str should not be shared */
371 /* shared substring */
372 str_make_independent(str);
373 assert(OBJ_FROZEN(str));
374 }
375 if (!BARE_STRING_P(str)) {
376 str = str_new_frozen(rb_cString, str);
377 }
378 }
379 RBASIC(str)->flags |= RSTRING_FSTR;
380
381 *key = *value = arg->fstr = str;
382 return ST_CONTINUE;
383 }
384}
385
386RUBY_FUNC_EXPORTED
387VALUE
388rb_fstring(VALUE str)
389{
390 VALUE fstr;
391 int bare;
392
393 Check_Type(str, T_STRING);
394
395 if (FL_TEST(str, RSTRING_FSTR))
396 return str;
397
398 bare = BARE_STRING_P(str);
399 if (!bare) {
400 if (STR_EMBED_P(str)) {
401 OBJ_FREEZE_RAW(str);
402 return str;
403 }
404
405 if (FL_TEST_RAW(str, STR_SHARED_ROOT | STR_SHARED) == STR_SHARED_ROOT) {
406 assert(OBJ_FROZEN(str));
407 return str;
408 }
409 }
410
411 if (!FL_TEST_RAW(str, FL_FREEZE | STR_NOFREE))
412 rb_str_resize(str, RSTRING_LEN(str));
413
414 fstr = register_fstring(str, FALSE);
415
416 if (!bare) {
417 str_replace_shared_without_enc(str, fstr);
418 OBJ_FREEZE_RAW(str);
419 return str;
420 }
421 return fstr;
422}
423
424static VALUE
425register_fstring(VALUE str, bool copy)
426{
427 struct fstr_update_arg args;
428 args.copy = copy;
429
430 RB_VM_LOCK_ENTER();
431 {
432 st_table *frozen_strings = rb_vm_fstring_table();
433 do {
434 args.fstr = str;
435 st_update(frozen_strings, (st_data_t)str, fstr_update_callback, (st_data_t)&args);
436 } while (UNDEF_P(args.fstr));
437 }
438 RB_VM_LOCK_LEAVE();
439
440 assert(OBJ_FROZEN(args.fstr));
441 assert(!FL_TEST_RAW(args.fstr, STR_FAKESTR));
442 assert(!FL_TEST_RAW(args.fstr, FL_EXIVAR));
443 assert(RBASIC_CLASS(args.fstr) == rb_cString);
444 return args.fstr;
445}
446
447static VALUE
448setup_fake_str(struct RString *fake_str, const char *name, long len, int encidx)
449{
450 fake_str->basic.flags = T_STRING|RSTRING_NOEMBED|STR_NOFREE|STR_FAKESTR;
451 /* SHARED to be allocated by the callback */
452
453 if (!name) {
455 name = "";
456 }
457
458 ENCODING_SET_INLINED((VALUE)fake_str, encidx);
459
460 RBASIC_SET_CLASS_RAW((VALUE)fake_str, rb_cString);
461 fake_str->len = len;
462 fake_str->as.heap.ptr = (char *)name;
463 fake_str->as.heap.aux.capa = len;
464 return (VALUE)fake_str;
465}
466
467/*
468 * set up a fake string which refers a static string literal.
469 */
470VALUE
471rb_setup_fake_str(struct RString *fake_str, const char *name, long len, rb_encoding *enc)
472{
473 return setup_fake_str(fake_str, name, len, rb_enc_to_index(enc));
474}
475
476/*
477 * rb_fstring_new and rb_fstring_cstr family create or lookup a frozen
478 * shared string which refers a static string literal. `ptr` must
479 * point a constant string.
480 */
481VALUE
482rb_fstring_new(const char *ptr, long len)
483{
484 struct RString fake_str;
485 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), FALSE);
486}
487
488VALUE
489rb_fstring_enc_new(const char *ptr, long len, rb_encoding *enc)
490{
491 struct RString fake_str;
492 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), FALSE);
493}
494
495VALUE
496rb_fstring_cstr(const char *ptr)
497{
498 return rb_fstring_new(ptr, strlen(ptr));
499}
500
501static int
502fstring_set_class_i(st_data_t key, st_data_t val, st_data_t arg)
503{
504 RBASIC_SET_CLASS((VALUE)key, (VALUE)arg);
505 return ST_CONTINUE;
506}
507
508static int
509fstring_cmp(VALUE a, VALUE b)
510{
511 long alen, blen;
512 const char *aptr, *bptr;
513 RSTRING_GETMEM(a, aptr, alen);
514 RSTRING_GETMEM(b, bptr, blen);
515 return (alen != blen ||
516 ENCODING_GET(a) != ENCODING_GET(b) ||
517 memcmp(aptr, bptr, alen) != 0);
518}
519
520static inline int
521single_byte_optimizable(VALUE str)
522{
523 rb_encoding *enc;
524
525 /* Conservative. It may be ENC_CODERANGE_UNKNOWN. */
527 return 1;
528
529 enc = STR_ENC_GET(str);
530 if (rb_enc_mbmaxlen(enc) == 1)
531 return 1;
532
533 /* Conservative. Possibly single byte.
534 * "\xa1" in Shift_JIS for example. */
535 return 0;
536}
537
539
540static inline const char *
541search_nonascii(const char *p, const char *e)
542{
543 const uintptr_t *s, *t;
544
545#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)
546# if SIZEOF_UINTPTR_T == 8
547# define NONASCII_MASK UINT64_C(0x8080808080808080)
548# elif SIZEOF_UINTPTR_T == 4
549# define NONASCII_MASK UINT32_C(0x80808080)
550# else
551# error "don't know what to do."
552# endif
553#else
554# if SIZEOF_UINTPTR_T == 8
555# define NONASCII_MASK ((uintptr_t)0x80808080UL << 32 | (uintptr_t)0x80808080UL)
556# elif SIZEOF_UINTPTR_T == 4
557# define NONASCII_MASK 0x80808080UL /* or...? */
558# else
559# error "don't know what to do."
560# endif
561#endif
562
563 if (UNALIGNED_WORD_ACCESS || e - p >= SIZEOF_VOIDP) {
564#if !UNALIGNED_WORD_ACCESS
565 if ((uintptr_t)p % SIZEOF_VOIDP) {
566 int l = SIZEOF_VOIDP - (uintptr_t)p % SIZEOF_VOIDP;
567 p += l;
568 switch (l) {
569 default: UNREACHABLE;
570#if SIZEOF_VOIDP > 4
571 case 7: if (p[-7]&0x80) return p-7;
572 case 6: if (p[-6]&0x80) return p-6;
573 case 5: if (p[-5]&0x80) return p-5;
574 case 4: if (p[-4]&0x80) return p-4;
575#endif
576 case 3: if (p[-3]&0x80) return p-3;
577 case 2: if (p[-2]&0x80) return p-2;
578 case 1: if (p[-1]&0x80) return p-1;
579 case 0: break;
580 }
581 }
582#endif
583#if defined(HAVE_BUILTIN___BUILTIN_ASSUME_ALIGNED) &&! UNALIGNED_WORD_ACCESS
584#define aligned_ptr(value) \
585 __builtin_assume_aligned((value), sizeof(uintptr_t))
586#else
587#define aligned_ptr(value) (uintptr_t *)(value)
588#endif
589 s = aligned_ptr(p);
590 t = (uintptr_t *)(e - (SIZEOF_VOIDP-1));
591#undef aligned_ptr
592 for (;s < t; s++) {
593 if (*s & NONASCII_MASK) {
594#ifdef WORDS_BIGENDIAN
595 return (const char *)s + (nlz_intptr(*s&NONASCII_MASK)>>3);
596#else
597 return (const char *)s + (ntz_intptr(*s&NONASCII_MASK)>>3);
598#endif
599 }
600 }
601 p = (const char *)s;
602 }
603
604 switch (e - p) {
605 default: UNREACHABLE;
606#if SIZEOF_VOIDP > 4
607 case 7: if (e[-7]&0x80) return e-7;
608 case 6: if (e[-6]&0x80) return e-6;
609 case 5: if (e[-5]&0x80) return e-5;
610 case 4: if (e[-4]&0x80) return e-4;
611#endif
612 case 3: if (e[-3]&0x80) return e-3;
613 case 2: if (e[-2]&0x80) return e-2;
614 case 1: if (e[-1]&0x80) return e-1;
615 case 0: return NULL;
616 }
617}
618
619static int
620coderange_scan(const char *p, long len, rb_encoding *enc)
621{
622 const char *e = p + len;
623
624 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
625 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
626 p = search_nonascii(p, e);
628 }
629
630 if (rb_enc_asciicompat(enc)) {
631 p = search_nonascii(p, e);
632 if (!p) return ENC_CODERANGE_7BIT;
633 for (;;) {
634 int ret = rb_enc_precise_mbclen(p, e, enc);
636 p += MBCLEN_CHARFOUND_LEN(ret);
637 if (p == e) break;
638 p = search_nonascii(p, e);
639 if (!p) break;
640 }
641 }
642 else {
643 while (p < e) {
644 int ret = rb_enc_precise_mbclen(p, e, enc);
646 p += MBCLEN_CHARFOUND_LEN(ret);
647 }
648 }
649 return ENC_CODERANGE_VALID;
650}
651
652long
653rb_str_coderange_scan_restartable(const char *s, const char *e, rb_encoding *enc, int *cr)
654{
655 const char *p = s;
656
657 if (*cr == ENC_CODERANGE_BROKEN)
658 return e - s;
659
660 if (rb_enc_to_index(enc) == rb_ascii8bit_encindex()) {
661 /* enc is ASCII-8BIT. ASCII-8BIT string never be broken. */
662 if (*cr == ENC_CODERANGE_VALID) return e - s;
663 p = search_nonascii(p, e);
665 return e - s;
666 }
667 else if (rb_enc_asciicompat(enc)) {
668 p = search_nonascii(p, e);
669 if (!p) {
670 if (*cr != ENC_CODERANGE_VALID) *cr = ENC_CODERANGE_7BIT;
671 return e - s;
672 }
673 for (;;) {
674 int ret = rb_enc_precise_mbclen(p, e, enc);
675 if (!MBCLEN_CHARFOUND_P(ret)) {
677 return p - s;
678 }
679 p += MBCLEN_CHARFOUND_LEN(ret);
680 if (p == e) break;
681 p = search_nonascii(p, e);
682 if (!p) break;
683 }
684 }
685 else {
686 while (p < e) {
687 int ret = rb_enc_precise_mbclen(p, e, enc);
688 if (!MBCLEN_CHARFOUND_P(ret)) {
690 return p - s;
691 }
692 p += MBCLEN_CHARFOUND_LEN(ret);
693 }
694 }
696 return e - s;
697}
698
699static inline void
700str_enc_copy(VALUE str1, VALUE str2)
701{
702 rb_enc_set_index(str1, ENCODING_GET(str2));
703}
704
705/* Like str_enc_copy, but does not check frozen status of str1.
706 * You should use this only if you're certain that str1 is not frozen. */
707static inline void
708str_enc_copy_direct(VALUE str1, VALUE str2)
709{
710 int inlined_encoding = RB_ENCODING_GET_INLINED(str2);
711 if (inlined_encoding == ENCODING_INLINE_MAX) {
712 rb_enc_set_index(str1, rb_enc_get_index(str2));
713 }
714 else {
715 ENCODING_SET_INLINED(str1, inlined_encoding);
716 }
717}
718
719static void
720rb_enc_cr_str_copy_for_substr(VALUE dest, VALUE src)
721{
722 /* this function is designed for copying encoding and coderange
723 * from src to new string "dest" which is made from the part of src.
724 */
725 str_enc_copy(dest, src);
726 if (RSTRING_LEN(dest) == 0) {
727 if (!rb_enc_asciicompat(STR_ENC_GET(src)))
729 else
731 return;
732 }
733 switch (ENC_CODERANGE(src)) {
736 break;
738 if (!rb_enc_asciicompat(STR_ENC_GET(src)) ||
739 search_nonascii(RSTRING_PTR(dest), RSTRING_END(dest)))
741 else
743 break;
744 default:
745 break;
746 }
747}
748
749static void
750rb_enc_cr_str_exact_copy(VALUE dest, VALUE src)
751{
752 str_enc_copy(dest, src);
754}
755
756static int
757enc_coderange_scan(VALUE str, rb_encoding *enc)
758{
759 return coderange_scan(RSTRING_PTR(str), RSTRING_LEN(str), enc);
760}
761
762int
763rb_enc_str_coderange_scan(VALUE str, rb_encoding *enc)
764{
765 return enc_coderange_scan(str, enc);
766}
767
768int
770{
771 int cr = ENC_CODERANGE(str);
772
773 if (cr == ENC_CODERANGE_UNKNOWN) {
774 cr = enc_coderange_scan(str, get_encoding(str));
775 ENC_CODERANGE_SET(str, cr);
776 }
777 return cr;
778}
779
780int
782{
783 rb_encoding *enc = STR_ENC_GET(str);
784
785 if (!rb_enc_asciicompat(enc))
786 return FALSE;
787 else if (is_ascii_string(str))
788 return TRUE;
789 return FALSE;
790}
791
792static inline void
793str_mod_check(VALUE s, const char *p, long len)
794{
795 if (RSTRING_PTR(s) != p || RSTRING_LEN(s) != len){
796 rb_raise(rb_eRuntimeError, "string modified");
797 }
798}
799
800static size_t
801str_capacity(VALUE str, const int termlen)
802{
803 if (STR_EMBED_P(str)) {
804 return str_embed_capa(str) - termlen;
805 }
806 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
807 return RSTRING(str)->len;
808 }
809 else {
810 return RSTRING(str)->as.heap.aux.capa;
811 }
812}
813
814size_t
816{
817 return str_capacity(str, TERM_LEN(str));
818}
819
820static inline void
821must_not_null(const char *ptr)
822{
823 if (!ptr) {
824 rb_raise(rb_eArgError, "NULL pointer given");
825 }
826}
827
828static inline VALUE
829str_alloc_embed(VALUE klass, size_t capa)
830{
831 size_t size = rb_str_embed_size(capa);
832 assert(size > 0);
833 assert(rb_gc_size_allocatable_p(size));
834
835 NEWOBJ_OF(str, struct RString, klass,
837
838 return (VALUE)str;
839}
840
841static inline VALUE
842str_alloc_heap(VALUE klass)
843{
844 NEWOBJ_OF(str, struct RString, klass,
845 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), 0);
846
847 return (VALUE)str;
848}
849
850static inline VALUE
851empty_str_alloc(VALUE klass)
852{
853 RUBY_DTRACE_CREATE_HOOK(STRING, 0);
854 VALUE str = str_alloc_embed(klass, 0);
855 memset(RSTRING(str)->as.embed.ary, 0, str_embed_capa(str));
856 return str;
857}
858
859static VALUE
860str_new0(VALUE klass, const char *ptr, long len, int termlen)
861{
862 VALUE str;
863
864 if (len < 0) {
865 rb_raise(rb_eArgError, "negative string size (or size too big)");
866 }
867
868 RUBY_DTRACE_CREATE_HOOK(STRING, len);
869
870 if (STR_EMBEDDABLE_P(len, termlen)) {
871 str = str_alloc_embed(klass, len + termlen);
872 if (len == 0) {
874 }
875 }
876 else {
877 str = str_alloc_heap(klass);
878 RSTRING(str)->as.heap.aux.capa = len;
879 /* :FIXME: @shyouhei guesses `len + termlen` is guaranteed to never
880 * integer overflow. If we can STATIC_ASSERT that, the following
881 * mul_add_mul can be reverted to a simple ALLOC_N. */
882 RSTRING(str)->as.heap.ptr =
883 rb_xmalloc_mul_add_mul(sizeof(char), len, sizeof(char), termlen);
884 }
885 if (ptr) {
886 memcpy(RSTRING_PTR(str), ptr, len);
887 }
888 STR_SET_LEN(str, len);
889 TERM_FILL(RSTRING_PTR(str) + len, termlen);
890 return str;
891}
892
893static VALUE
894str_new(VALUE klass, const char *ptr, long len)
895{
896 return str_new0(klass, ptr, len, 1);
897}
898
899VALUE
900rb_str_new(const char *ptr, long len)
901{
902 return str_new(rb_cString, ptr, len);
903}
904
905VALUE
906rb_usascii_str_new(const char *ptr, long len)
907{
908 VALUE str = rb_str_new(ptr, len);
909 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
910 return str;
911}
912
913VALUE
914rb_utf8_str_new(const char *ptr, long len)
915{
916 VALUE str = str_new(rb_cString, ptr, len);
917 rb_enc_associate_index(str, rb_utf8_encindex());
918 return str;
919}
920
921VALUE
922rb_enc_str_new(const char *ptr, long len, rb_encoding *enc)
923{
924 VALUE str;
925
926 if (!enc) return rb_str_new(ptr, len);
927
928 str = str_new0(rb_cString, ptr, len, rb_enc_mbminlen(enc));
929 rb_enc_associate(str, enc);
930 return str;
931}
932
933VALUE
935{
936 must_not_null(ptr);
937 /* rb_str_new_cstr() can take pointer from non-malloc-generated
938 * memory regions, and that cannot be detected by the MSAN. Just
939 * trust the programmer that the argument passed here is a sane C
940 * string. */
941 __msan_unpoison_string(ptr);
942 return rb_str_new(ptr, strlen(ptr));
943}
944
945VALUE
947{
949 ENCODING_CODERANGE_SET(str, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
950 return str;
951}
952
953VALUE
955{
957 rb_enc_associate_index(str, rb_utf8_encindex());
958 return str;
959}
960
961VALUE
963{
964 must_not_null(ptr);
965 if (rb_enc_mbminlen(enc) != 1) {
966 rb_raise(rb_eArgError, "wchar encoding given");
967 }
968 return rb_enc_str_new(ptr, strlen(ptr), enc);
969}
970
971static VALUE
972str_new_static(VALUE klass, const char *ptr, long len, int encindex)
973{
974 VALUE str;
975
976 if (len < 0) {
977 rb_raise(rb_eArgError, "negative string size (or size too big)");
978 }
979
980 if (!ptr) {
981 rb_encoding *enc = rb_enc_get_from_index(encindex);
982 str = str_new0(klass, ptr, len, rb_enc_mbminlen(enc));
983 }
984 else {
985 RUBY_DTRACE_CREATE_HOOK(STRING, len);
986 str = str_alloc_heap(klass);
987 RSTRING(str)->len = len;
988 RSTRING(str)->as.heap.ptr = (char *)ptr;
989 RSTRING(str)->as.heap.aux.capa = len;
990 RBASIC(str)->flags |= STR_NOFREE;
991 }
992 rb_enc_associate_index(str, encindex);
993 return str;
994}
995
996VALUE
997rb_str_new_static(const char *ptr, long len)
998{
999 return str_new_static(rb_cString, ptr, len, 0);
1000}
1001
1002VALUE
1004{
1005 return str_new_static(rb_cString, ptr, len, ENCINDEX_US_ASCII);
1006}
1007
1008VALUE
1010{
1011 return str_new_static(rb_cString, ptr, len, ENCINDEX_UTF_8);
1012}
1013
1014VALUE
1016{
1017 return str_new_static(rb_cString, ptr, len, rb_enc_to_index(enc));
1018}
1019
1020static VALUE str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1021 rb_encoding *from, rb_encoding *to,
1022 int ecflags, VALUE ecopts);
1023
1024static inline bool
1025is_enc_ascii_string(VALUE str, rb_encoding *enc)
1026{
1027 int encidx = rb_enc_to_index(enc);
1028 if (rb_enc_get_index(str) == encidx)
1029 return is_ascii_string(str);
1030 return enc_coderange_scan(str, enc) == ENC_CODERANGE_7BIT;
1031}
1032
1033VALUE
1034rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
1035{
1036 long len;
1037 const char *ptr;
1038 VALUE newstr;
1039
1040 if (!to) return str;
1041 if (!from) from = rb_enc_get(str);
1042 if (from == to) return str;
1043 if ((rb_enc_asciicompat(to) && is_enc_ascii_string(str, from)) ||
1044 rb_is_ascii8bit_enc(to)) {
1045 if (STR_ENC_GET(str) != to) {
1046 str = rb_str_dup(str);
1047 rb_enc_associate(str, to);
1048 }
1049 return str;
1050 }
1051
1052 RSTRING_GETMEM(str, ptr, len);
1053 newstr = str_cat_conv_enc_opts(rb_str_buf_new(len), 0, ptr, len,
1054 from, to, ecflags, ecopts);
1055 if (NIL_P(newstr)) {
1056 /* some error, return original */
1057 return str;
1058 }
1059 return newstr;
1060}
1061
1062VALUE
1063rb_str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1064 rb_encoding *from, int ecflags, VALUE ecopts)
1065{
1066 long olen;
1067
1068 olen = RSTRING_LEN(newstr);
1069 if (ofs < -olen || olen < ofs)
1070 rb_raise(rb_eIndexError, "index %ld out of string", ofs);
1071 if (ofs < 0) ofs += olen;
1072 if (!from) {
1073 STR_SET_LEN(newstr, ofs);
1074 return rb_str_cat(newstr, ptr, len);
1075 }
1076
1077 rb_str_modify(newstr);
1078 return str_cat_conv_enc_opts(newstr, ofs, ptr, len, from,
1079 rb_enc_get(newstr),
1080 ecflags, ecopts);
1081}
1082
1083VALUE
1084rb_str_initialize(VALUE str, const char *ptr, long len, rb_encoding *enc)
1085{
1086 STR_SET_LEN(str, 0);
1087 rb_enc_associate(str, enc);
1088 rb_str_cat(str, ptr, len);
1089 return str;
1090}
1091
1092static VALUE
1093str_cat_conv_enc_opts(VALUE newstr, long ofs, const char *ptr, long len,
1094 rb_encoding *from, rb_encoding *to,
1095 int ecflags, VALUE ecopts)
1096{
1097 rb_econv_t *ec;
1099 long olen;
1100 VALUE econv_wrapper;
1101 const unsigned char *start, *sp;
1102 unsigned char *dest, *dp;
1103 size_t converted_output = (size_t)ofs;
1104
1105 olen = rb_str_capacity(newstr);
1106
1107 econv_wrapper = rb_obj_alloc(rb_cEncodingConverter);
1108 RBASIC_CLEAR_CLASS(econv_wrapper);
1109 ec = rb_econv_open_opts(from->name, to->name, ecflags, ecopts);
1110 if (!ec) return Qnil;
1111 DATA_PTR(econv_wrapper) = ec;
1112
1113 sp = (unsigned char*)ptr;
1114 start = sp;
1115 while ((dest = (unsigned char*)RSTRING_PTR(newstr)),
1116 (dp = dest + converted_output),
1117 (ret = rb_econv_convert(ec, &sp, start + len, &dp, dest + olen, 0)),
1119 /* destination buffer short */
1120 size_t converted_input = sp - start;
1121 size_t rest = len - converted_input;
1122 converted_output = dp - dest;
1123 rb_str_set_len(newstr, converted_output);
1124 if (converted_input && converted_output &&
1125 rest < (LONG_MAX / converted_output)) {
1126 rest = (rest * converted_output) / converted_input;
1127 }
1128 else {
1129 rest = olen;
1130 }
1131 olen += rest < 2 ? 2 : rest;
1132 rb_str_resize(newstr, olen);
1133 }
1134 DATA_PTR(econv_wrapper) = 0;
1135 rb_econv_close(ec);
1136 switch (ret) {
1137 case econv_finished:
1138 len = dp - (unsigned char*)RSTRING_PTR(newstr);
1139 rb_str_set_len(newstr, len);
1140 rb_enc_associate(newstr, to);
1141 return newstr;
1142
1143 default:
1144 return Qnil;
1145 }
1146}
1147
1148VALUE
1150{
1151 return rb_str_conv_enc_opts(str, from, to, 0, Qnil);
1152}
1153
1154VALUE
1156{
1157 rb_encoding *ienc;
1158 VALUE str;
1159 const int eidx = rb_enc_to_index(eenc);
1160
1161 if (!ptr) {
1162 return rb_enc_str_new(ptr, len, eenc);
1163 }
1164
1165 /* ASCII-8BIT case, no conversion */
1166 if ((eidx == rb_ascii8bit_encindex()) ||
1167 (eidx == rb_usascii_encindex() && search_nonascii(ptr, ptr + len))) {
1168 return rb_str_new(ptr, len);
1169 }
1170 /* no default_internal or same encoding, no conversion */
1171 ienc = rb_default_internal_encoding();
1172 if (!ienc || eenc == ienc) {
1173 return rb_enc_str_new(ptr, len, eenc);
1174 }
1175 /* ASCII compatible, and ASCII only string, no conversion in
1176 * default_internal */
1177 if ((eidx == rb_ascii8bit_encindex()) ||
1178 (eidx == rb_usascii_encindex()) ||
1179 (rb_enc_asciicompat(eenc) && !search_nonascii(ptr, ptr + len))) {
1180 return rb_enc_str_new(ptr, len, ienc);
1181 }
1182 /* convert from the given encoding to default_internal */
1183 str = rb_enc_str_new(NULL, 0, ienc);
1184 /* when the conversion failed for some reason, just ignore the
1185 * default_internal and result in the given encoding as-is. */
1186 if (NIL_P(rb_str_cat_conv_enc_opts(str, 0, ptr, len, eenc, 0, Qnil))) {
1187 rb_str_initialize(str, ptr, len, eenc);
1188 }
1189 return str;
1190}
1191
1192VALUE
1193rb_external_str_with_enc(VALUE str, rb_encoding *eenc)
1194{
1195 int eidx = rb_enc_to_index(eenc);
1196 if (eidx == rb_usascii_encindex() &&
1197 !is_ascii_string(str)) {
1198 rb_enc_associate_index(str, rb_ascii8bit_encindex());
1199 return str;
1200 }
1201 rb_enc_associate_index(str, eidx);
1202 return rb_str_conv_enc(str, eenc, rb_default_internal_encoding());
1203}
1204
1205VALUE
1206rb_external_str_new(const char *ptr, long len)
1207{
1208 return rb_external_str_new_with_enc(ptr, len, rb_default_external_encoding());
1209}
1210
1211VALUE
1213{
1214 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_default_external_encoding());
1215}
1216
1217VALUE
1218rb_locale_str_new(const char *ptr, long len)
1219{
1220 return rb_external_str_new_with_enc(ptr, len, rb_locale_encoding());
1221}
1222
1223VALUE
1225{
1226 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_locale_encoding());
1227}
1228
1229VALUE
1231{
1232 return rb_external_str_new_with_enc(ptr, len, rb_filesystem_encoding());
1233}
1234
1235VALUE
1236rb_filesystem_str_new_cstr(const char *ptr)
1237{
1238 return rb_external_str_new_with_enc(ptr, strlen(ptr), rb_filesystem_encoding());
1239}
1240
1241VALUE
1243{
1244 return rb_str_export_to_enc(str, rb_default_external_encoding());
1245}
1246
1247VALUE
1249{
1250 return rb_str_export_to_enc(str, rb_locale_encoding());
1251}
1252
1253VALUE
1255{
1256 return rb_str_conv_enc(str, STR_ENC_GET(str), enc);
1257}
1258
1259static VALUE
1260str_replace_shared_without_enc(VALUE str2, VALUE str)
1261{
1262 const int termlen = TERM_LEN(str);
1263 char *ptr;
1264 long len;
1265
1266 RSTRING_GETMEM(str, ptr, len);
1267 if (str_embed_capa(str2) >= len + termlen) {
1268 char *ptr2 = RSTRING(str2)->as.embed.ary;
1269 STR_SET_EMBED(str2);
1270 memcpy(ptr2, RSTRING_PTR(str), len);
1271 TERM_FILL(ptr2+len, termlen);
1272 }
1273 else {
1274 VALUE root;
1275 if (STR_SHARED_P(str)) {
1276 root = RSTRING(str)->as.heap.aux.shared;
1277 RSTRING_GETMEM(str, ptr, len);
1278 }
1279 else {
1280 root = rb_str_new_frozen(str);
1281 RSTRING_GETMEM(root, ptr, len);
1282 }
1283 assert(OBJ_FROZEN(root));
1284 if (!STR_EMBED_P(str2) && !FL_TEST_RAW(str2, STR_SHARED|STR_NOFREE)) {
1285 if (FL_TEST_RAW(str2, STR_SHARED_ROOT)) {
1286 rb_fatal("about to free a possible shared root");
1287 }
1288 char *ptr2 = STR_HEAP_PTR(str2);
1289 if (ptr2 != ptr) {
1290 ruby_sized_xfree(ptr2, STR_HEAP_SIZE(str2));
1291 }
1292 }
1293 FL_SET(str2, STR_NOEMBED);
1294 RSTRING(str2)->as.heap.ptr = ptr;
1295 STR_SET_SHARED(str2, root);
1296 }
1297
1298 STR_SET_LEN(str2, len);
1299
1300 return str2;
1301}
1302
1303static VALUE
1304str_replace_shared(VALUE str2, VALUE str)
1305{
1306 str_replace_shared_without_enc(str2, str);
1307 rb_enc_cr_str_exact_copy(str2, str);
1308 return str2;
1309}
1310
1311static VALUE
1312str_new_shared(VALUE klass, VALUE str)
1313{
1314 return str_replace_shared(str_alloc_heap(klass), str);
1315}
1316
1317VALUE
1319{
1320 return str_new_shared(rb_obj_class(str), str);
1321}
1322
1323VALUE
1324rb_str_new_frozen(VALUE orig)
1325{
1326 if (OBJ_FROZEN(orig)) return orig;
1327 return str_new_frozen(rb_obj_class(orig), orig);
1328}
1329
1330static VALUE
1331rb_str_new_frozen_String(VALUE orig)
1332{
1333 if (OBJ_FROZEN(orig) && rb_obj_class(orig) == rb_cString) return orig;
1334 return str_new_frozen(rb_cString, orig);
1335}
1336
1337VALUE
1338rb_str_tmp_frozen_acquire(VALUE orig)
1339{
1340 if (OBJ_FROZEN_RAW(orig)) return orig;
1341 return str_new_frozen_buffer(0, orig, FALSE);
1342}
1343
1344void
1345rb_str_tmp_frozen_release(VALUE orig, VALUE tmp)
1346{
1347 if (RBASIC_CLASS(tmp) != 0)
1348 return;
1349
1350 if (STR_EMBED_P(tmp)) {
1351 assert(OBJ_FROZEN_RAW(tmp));
1352 }
1353 else if (FL_TEST_RAW(orig, STR_SHARED) &&
1354 !FL_TEST_RAW(orig, STR_TMPLOCK|RUBY_FL_FREEZE)) {
1355 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1356
1357 if (shared == tmp && !FL_TEST_RAW(tmp, STR_BORROWED)) {
1358 assert(RSTRING(orig)->as.heap.ptr == RSTRING(tmp)->as.heap.ptr);
1359 assert(RSTRING_LEN(orig) == RSTRING_LEN(tmp));
1360
1361 /* Unshare orig since the root (tmp) only has this one child. */
1362 FL_UNSET_RAW(orig, STR_SHARED);
1363 RSTRING(orig)->as.heap.aux.capa = RSTRING(tmp)->as.heap.aux.capa;
1364 RBASIC(orig)->flags |= RBASIC(tmp)->flags & STR_NOFREE;
1365 assert(OBJ_FROZEN_RAW(tmp));
1366
1367 /* Make tmp embedded and empty so it is safe for sweeping. */
1368 STR_SET_EMBED(tmp);
1369 STR_SET_LEN(tmp, 0);
1370 }
1371 }
1372}
1373
1374static VALUE
1375str_new_frozen(VALUE klass, VALUE orig)
1376{
1377 return str_new_frozen_buffer(klass, orig, TRUE);
1378}
1379
1380static VALUE
1381heap_str_make_shared(VALUE klass, VALUE orig)
1382{
1383 assert(!STR_EMBED_P(orig));
1384 assert(!STR_SHARED_P(orig));
1385
1386 VALUE str = str_alloc_heap(klass);
1387 STR_SET_LEN(str, RSTRING_LEN(orig));
1388 RSTRING(str)->as.heap.ptr = RSTRING_PTR(orig);
1389 RSTRING(str)->as.heap.aux.capa = RSTRING(orig)->as.heap.aux.capa;
1390 RBASIC(str)->flags |= RBASIC(orig)->flags & STR_NOFREE;
1391 RBASIC(orig)->flags &= ~STR_NOFREE;
1392 STR_SET_SHARED(orig, str);
1393 if (klass == 0)
1394 FL_UNSET_RAW(str, STR_BORROWED);
1395 return str;
1396}
1397
1398static VALUE
1399str_new_frozen_buffer(VALUE klass, VALUE orig, int copy_encoding)
1400{
1401 VALUE str;
1402
1403 long len = RSTRING_LEN(orig);
1404 int termlen = copy_encoding ? TERM_LEN(orig) : 1;
1405
1406 if (STR_EMBED_P(orig) || STR_EMBEDDABLE_P(len, termlen)) {
1407 str = str_new0(klass, RSTRING_PTR(orig), len, termlen);
1408 assert(STR_EMBED_P(str));
1409 }
1410 else {
1411 if (FL_TEST_RAW(orig, STR_SHARED)) {
1412 VALUE shared = RSTRING(orig)->as.heap.aux.shared;
1413 long ofs = RSTRING(orig)->as.heap.ptr - RSTRING_PTR(shared);
1414 long rest = RSTRING_LEN(shared) - ofs - RSTRING_LEN(orig);
1415 assert(ofs >= 0);
1416 assert(rest >= 0);
1417 assert(ofs + rest <= RSTRING_LEN(shared));
1418 assert(OBJ_FROZEN(shared));
1419
1420 if ((ofs > 0) || (rest > 0) ||
1421 (klass != RBASIC(shared)->klass) ||
1422 ENCODING_GET(shared) != ENCODING_GET(orig)) {
1423 str = str_new_shared(klass, shared);
1424 assert(!STR_EMBED_P(str));
1425 RSTRING(str)->as.heap.ptr += ofs;
1426 STR_SET_LEN(str, RSTRING_LEN(str) - (ofs + rest));
1427 }
1428 else {
1429 if (RBASIC_CLASS(shared) == 0)
1430 FL_SET_RAW(shared, STR_BORROWED);
1431 return shared;
1432 }
1433 }
1434 else if (STR_EMBEDDABLE_P(RSTRING_LEN(orig), TERM_LEN(orig))) {
1435 str = str_alloc_embed(klass, RSTRING_LEN(orig) + TERM_LEN(orig));
1436 STR_SET_EMBED(str);
1437 memcpy(RSTRING_PTR(str), RSTRING_PTR(orig), RSTRING_LEN(orig));
1438 STR_SET_LEN(str, RSTRING_LEN(orig));
1439 TERM_FILL(RSTRING_END(str), TERM_LEN(orig));
1440 }
1441 else {
1442 str = heap_str_make_shared(klass, orig);
1443 }
1444 }
1445
1446 if (copy_encoding) rb_enc_cr_str_exact_copy(str, orig);
1447 OBJ_FREEZE(str);
1448 return str;
1449}
1450
1451VALUE
1452rb_str_new_with_class(VALUE obj, const char *ptr, long len)
1453{
1454 return str_new0(rb_obj_class(obj), ptr, len, TERM_LEN(obj));
1455}
1456
1457static VALUE
1458str_new_empty_String(VALUE str)
1459{
1460 VALUE v = rb_str_new(0, 0);
1461 rb_enc_copy(v, str);
1462 return v;
1463}
1464
1465#define STR_BUF_MIN_SIZE 63
1466
1467VALUE
1468rb_str_buf_new(long capa)
1469{
1470 if (STR_EMBEDDABLE_P(capa, 1)) {
1471 return str_alloc_embed(rb_cString, capa + 1);
1472 }
1473
1474 VALUE str = str_alloc_heap(rb_cString);
1475
1476 RSTRING(str)->as.heap.aux.capa = capa;
1477 RSTRING(str)->as.heap.ptr = ALLOC_N(char, (size_t)capa + 1);
1478 RSTRING(str)->as.heap.ptr[0] = '\0';
1479
1480 return str;
1481}
1482
1483VALUE
1485{
1486 VALUE str;
1487 long len = strlen(ptr);
1488
1489 str = rb_str_buf_new(len);
1490 rb_str_buf_cat(str, ptr, len);
1491
1492 return str;
1493}
1494
1495VALUE
1497{
1498 return str_new(0, 0, len);
1499}
1500
1501void
1503{
1504 if (FL_TEST(str, RSTRING_FSTR)) {
1505 st_data_t fstr = (st_data_t)str;
1506
1507 RB_VM_LOCK_ENTER();
1508 {
1509 st_delete(rb_vm_fstring_table(), &fstr, NULL);
1510 RB_DEBUG_COUNTER_INC(obj_str_fstr);
1511 }
1512 RB_VM_LOCK_LEAVE();
1513 }
1514
1515 if (STR_EMBED_P(str)) {
1516 RB_DEBUG_COUNTER_INC(obj_str_embed);
1517 }
1518 else if (FL_TEST(str, STR_SHARED | STR_NOFREE)) {
1519 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_SHARED));
1520 (void)RB_DEBUG_COUNTER_INC_IF(obj_str_shared, FL_TEST(str, STR_NOFREE));
1521 }
1522 else {
1523 RB_DEBUG_COUNTER_INC(obj_str_ptr);
1524 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
1525 }
1526}
1527
1528RUBY_FUNC_EXPORTED size_t
1529rb_str_memsize(VALUE str)
1530{
1531 if (FL_TEST(str, STR_NOEMBED|STR_SHARED|STR_NOFREE) == STR_NOEMBED) {
1532 return STR_HEAP_SIZE(str);
1533 }
1534 else {
1535 return 0;
1536 }
1537}
1538
1539VALUE
1541{
1542 return rb_convert_type_with_id(str, T_STRING, "String", idTo_str);
1543}
1544
1545static inline void str_discard(VALUE str);
1546static void str_shared_replace(VALUE str, VALUE str2);
1547
1548void
1550{
1551 if (str != str2) str_shared_replace(str, str2);
1552}
1553
1554static void
1555str_shared_replace(VALUE str, VALUE str2)
1556{
1557 rb_encoding *enc;
1558 int cr;
1559 int termlen;
1560
1561 RUBY_ASSERT(str2 != str);
1562 enc = STR_ENC_GET(str2);
1563 cr = ENC_CODERANGE(str2);
1564 str_discard(str);
1565 termlen = rb_enc_mbminlen(enc);
1566
1567 STR_SET_LEN(str, RSTRING_LEN(str2));
1568
1569 if (str_embed_capa(str) >= RSTRING_LEN(str2) + termlen) {
1570 STR_SET_EMBED(str);
1571 memcpy(RSTRING_PTR(str), RSTRING_PTR(str2), (size_t)RSTRING_LEN(str2) + termlen);
1572 rb_enc_associate(str, enc);
1573 ENC_CODERANGE_SET(str, cr);
1574 }
1575 else {
1576 if (STR_EMBED_P(str2)) {
1577 assert(!FL_TEST(str2, STR_SHARED));
1578 long len = RSTRING_LEN(str2);
1579 assert(len + termlen <= str_embed_capa(str2));
1580
1581 char *new_ptr = ALLOC_N(char, len + termlen);
1582 memcpy(new_ptr, RSTRING(str2)->as.embed.ary, len + termlen);
1583 RSTRING(str2)->as.heap.ptr = new_ptr;
1584 STR_SET_LEN(str2, len);
1585 RSTRING(str2)->as.heap.aux.capa = len;
1586 STR_SET_NOEMBED(str2);
1587 }
1588
1589 STR_SET_NOEMBED(str);
1590 FL_UNSET(str, STR_SHARED);
1591 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1592
1593 if (FL_TEST(str2, STR_SHARED)) {
1594 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1595 STR_SET_SHARED(str, shared);
1596 }
1597 else {
1598 RSTRING(str)->as.heap.aux.capa = RSTRING(str2)->as.heap.aux.capa;
1599 }
1600
1601 /* abandon str2 */
1602 STR_SET_EMBED(str2);
1603 RSTRING_PTR(str2)[0] = 0;
1604 STR_SET_LEN(str2, 0);
1605 rb_enc_associate(str, enc);
1606 ENC_CODERANGE_SET(str, cr);
1607 }
1608}
1609
1610VALUE
1611rb_obj_as_string(VALUE obj)
1612{
1613 VALUE str;
1614
1615 if (RB_TYPE_P(obj, T_STRING)) {
1616 return obj;
1617 }
1618 str = rb_funcall(obj, idTo_s, 0);
1619 return rb_obj_as_string_result(str, obj);
1620}
1621
1622VALUE
1623rb_obj_as_string_result(VALUE str, VALUE obj)
1624{
1625 if (!RB_TYPE_P(str, T_STRING))
1626 return rb_any_to_s(obj);
1627 return str;
1628}
1629
1630static VALUE
1631str_replace(VALUE str, VALUE str2)
1632{
1633 long len;
1634
1635 len = RSTRING_LEN(str2);
1636 if (STR_SHARED_P(str2)) {
1637 VALUE shared = RSTRING(str2)->as.heap.aux.shared;
1638 assert(OBJ_FROZEN(shared));
1639 STR_SET_NOEMBED(str);
1640 STR_SET_LEN(str, len);
1641 RSTRING(str)->as.heap.ptr = RSTRING_PTR(str2);
1642 STR_SET_SHARED(str, shared);
1643 rb_enc_cr_str_exact_copy(str, str2);
1644 }
1645 else {
1646 str_replace_shared(str, str2);
1647 }
1648
1649 return str;
1650}
1651
1652static inline VALUE
1653ec_str_alloc_embed(struct rb_execution_context_struct *ec, VALUE klass, size_t capa)
1654{
1655 size_t size = rb_str_embed_size(capa);
1656 assert(size > 0);
1657 assert(rb_gc_size_allocatable_p(size));
1658
1659 NEWOBJ_OF(str, struct RString, klass,
1661
1662 return (VALUE)str;
1663}
1664
1665static inline VALUE
1666ec_str_alloc_heap(struct rb_execution_context_struct *ec, VALUE klass)
1667{
1668 NEWOBJ_OF(str, struct RString, klass,
1669 T_STRING | STR_NOEMBED | (RGENGC_WB_PROTECTED_STRING ? FL_WB_PROTECTED : 0), sizeof(struct RString), ec);
1670
1671 return (VALUE)str;
1672}
1673
1674static inline VALUE
1675str_duplicate_setup(VALUE klass, VALUE str, VALUE dup)
1676{
1677 const VALUE flag_mask =
1679 FL_FREEZE
1680 ;
1681 VALUE flags = FL_TEST_RAW(str, flag_mask);
1682 int encidx = 0;
1683 if (STR_EMBED_P(str)) {
1684 long len = RSTRING_LEN(str);
1685
1686 assert(STR_EMBED_P(dup));
1687 assert(str_embed_capa(dup) >= len + 1);
1688 MEMCPY(RSTRING(dup)->as.embed.ary, RSTRING(str)->as.embed.ary, char, len + 1);
1689 }
1690 else {
1691 VALUE root = str;
1692 if (FL_TEST_RAW(str, STR_SHARED)) {
1693 root = RSTRING(str)->as.heap.aux.shared;
1694 }
1695 else if (UNLIKELY(!(flags & FL_FREEZE))) {
1696 root = str = str_new_frozen(klass, str);
1697 flags = FL_TEST_RAW(str, flag_mask);
1698 }
1699 assert(!STR_SHARED_P(root));
1700 assert(RB_OBJ_FROZEN_RAW(root));
1701
1702 RSTRING(dup)->as.heap.ptr = RSTRING_PTR(str);
1703 FL_SET(root, STR_SHARED_ROOT);
1704 RB_OBJ_WRITE(dup, &RSTRING(dup)->as.heap.aux.shared, root);
1705 flags |= RSTRING_NOEMBED | STR_SHARED;
1706 }
1707
1708 STR_SET_LEN(dup, RSTRING_LEN(str));
1709
1710 if ((flags & ENCODING_MASK) == (ENCODING_INLINE_MAX<<ENCODING_SHIFT)) {
1711 encidx = rb_enc_get_index(str);
1712 flags &= ~ENCODING_MASK;
1713 }
1714 FL_SET_RAW(dup, flags & ~FL_FREEZE);
1715 if (encidx) rb_enc_associate_index(dup, encidx);
1716 return dup;
1717}
1718
1719static inline VALUE
1720ec_str_duplicate(struct rb_execution_context_struct *ec, VALUE klass, VALUE str)
1721{
1722 VALUE dup;
1723 if (STR_EMBED_P(str)) {
1724 dup = ec_str_alloc_embed(ec, klass, RSTRING_LEN(str) + TERM_LEN(str));
1725 }
1726 else {
1727 dup = ec_str_alloc_heap(ec, klass);
1728 }
1729
1730 return str_duplicate_setup(klass, str, dup);
1731}
1732
1733static inline VALUE
1734str_duplicate(VALUE klass, VALUE str)
1735{
1736 VALUE dup;
1737 if (STR_EMBED_P(str)) {
1738 dup = str_alloc_embed(klass, RSTRING_LEN(str) + TERM_LEN(str));
1739 }
1740 else {
1741 dup = str_alloc_heap(klass);
1742 }
1743
1744 return str_duplicate_setup(klass, str, dup);
1745}
1746
1747VALUE
1748rb_str_dup(VALUE str)
1749{
1750 return str_duplicate(rb_obj_class(str), str);
1751}
1752
1753/* :nodoc: */
1754VALUE
1755rb_str_dup_m(VALUE str)
1756{
1757 if (LIKELY(BARE_STRING_P(str))) {
1758 return str_duplicate(rb_obj_class(str), str);
1759 }
1760 else {
1761 return rb_obj_dup(str);
1762 }
1763}
1764
1765VALUE
1767{
1768 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1769 return str_duplicate(rb_cString, str);
1770}
1771
1772VALUE
1773rb_ec_str_resurrect(struct rb_execution_context_struct *ec, VALUE str)
1774{
1775 RUBY_DTRACE_CREATE_HOOK(STRING, RSTRING_LEN(str));
1776 return ec_str_duplicate(ec, rb_cString, str);
1777}
1778
1779/*
1780 *
1781 * call-seq:
1782 * String.new(string = '', **opts) -> new_string
1783 *
1784 * :include: doc/string/new.rdoc
1785 *
1786 */
1787
1788static VALUE
1789rb_str_init(int argc, VALUE *argv, VALUE str)
1790{
1791 static ID keyword_ids[2];
1792 VALUE orig, opt, venc, vcapa;
1793 VALUE kwargs[2];
1794 rb_encoding *enc = 0;
1795 int n;
1796
1797 if (!keyword_ids[0]) {
1798 keyword_ids[0] = rb_id_encoding();
1799 CONST_ID(keyword_ids[1], "capacity");
1800 }
1801
1802 n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1803 if (!NIL_P(opt)) {
1804 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1805 venc = kwargs[0];
1806 vcapa = kwargs[1];
1807 if (!UNDEF_P(venc) && !NIL_P(venc)) {
1808 enc = rb_to_encoding(venc);
1809 }
1810 if (!UNDEF_P(vcapa) && !NIL_P(vcapa)) {
1811 long capa = NUM2LONG(vcapa);
1812 long len = 0;
1813 int termlen = enc ? rb_enc_mbminlen(enc) : 1;
1814
1815 if (capa < STR_BUF_MIN_SIZE) {
1816 capa = STR_BUF_MIN_SIZE;
1817 }
1818 if (n == 1) {
1819 StringValue(orig);
1820 len = RSTRING_LEN(orig);
1821 if (capa < len) {
1822 capa = len;
1823 }
1824 if (orig == str) n = 0;
1825 }
1826 str_modifiable(str);
1827 if (STR_EMBED_P(str)) { /* make noembed always */
1828 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1829 assert(RSTRING_LEN(str) + 1 <= str_embed_capa(str));
1830 memcpy(new_ptr, RSTRING(str)->as.embed.ary, RSTRING_LEN(str) + 1);
1831 RSTRING(str)->as.heap.ptr = new_ptr;
1832 }
1833 else if (FL_TEST(str, STR_SHARED|STR_NOFREE)) {
1834 const size_t size = (size_t)capa + termlen;
1835 const char *const old_ptr = RSTRING_PTR(str);
1836 const size_t osize = RSTRING_LEN(str) + TERM_LEN(str);
1837 char *new_ptr = ALLOC_N(char, (size_t)capa + termlen);
1838 memcpy(new_ptr, old_ptr, osize < size ? osize : size);
1839 FL_UNSET_RAW(str, STR_SHARED|STR_NOFREE);
1840 RSTRING(str)->as.heap.ptr = new_ptr;
1841 }
1842 else if (STR_HEAP_SIZE(str) != (size_t)capa + termlen) {
1843 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
1844 (size_t)capa + termlen, STR_HEAP_SIZE(str));
1845 }
1846 STR_SET_LEN(str, len);
1847 TERM_FILL(&RSTRING(str)->as.heap.ptr[len], termlen);
1848 if (n == 1) {
1849 memcpy(RSTRING(str)->as.heap.ptr, RSTRING_PTR(orig), len);
1850 rb_enc_cr_str_exact_copy(str, orig);
1851 }
1852 FL_SET(str, STR_NOEMBED);
1853 RSTRING(str)->as.heap.aux.capa = capa;
1854 }
1855 else if (n == 1) {
1856 rb_str_replace(str, orig);
1857 }
1858 if (enc) {
1859 rb_enc_associate(str, enc);
1861 }
1862 }
1863 else if (n == 1) {
1864 rb_str_replace(str, orig);
1865 }
1866 return str;
1867}
1868
1869/* :nodoc: */
1870static VALUE
1871rb_str_s_new(int argc, VALUE *argv, VALUE klass)
1872{
1873 if (klass != rb_cString) {
1874 return rb_class_new_instance_pass_kw(argc, argv, klass);
1875 }
1876
1877 static ID keyword_ids[2];
1878 VALUE orig, opt, encoding = Qnil, capacity = Qnil;
1879 VALUE kwargs[2];
1880 rb_encoding *enc = NULL;
1881
1882 int n = rb_scan_args(argc, argv, "01:", &orig, &opt);
1883 if (NIL_P(opt)) {
1884 return rb_class_new_instance_pass_kw(argc, argv, klass);
1885 }
1886
1887 keyword_ids[0] = rb_id_encoding();
1888 CONST_ID(keyword_ids[1], "capacity");
1889 rb_get_kwargs(opt, keyword_ids, 0, 2, kwargs);
1890 encoding = kwargs[0];
1891 capacity = kwargs[1];
1892
1893 int termlen = 1;
1894
1895 if (n == 1) {
1896 orig = StringValue(orig);
1897 }
1898 else {
1899 orig = Qnil;
1900 }
1901
1902 if (UNDEF_P(encoding)) {
1903 if (!NIL_P(orig)) {
1904 encoding = rb_obj_encoding(orig);
1905 }
1906 }
1907
1908 if (!UNDEF_P(encoding)) {
1909 enc = rb_to_encoding(encoding);
1910 termlen = rb_enc_mbminlen(enc);
1911 }
1912
1913 // If capacity is nil, we're basically just duping `orig`.
1914 if (UNDEF_P(capacity)) {
1915 if (NIL_P(orig)) {
1916 VALUE empty_str = str_new(klass, "", 0);
1917 if (enc) {
1918 rb_enc_associate(empty_str, enc);
1919 }
1920 return empty_str;
1921 }
1922 VALUE copy = str_duplicate(klass, orig);
1923 rb_enc_associate(copy, enc);
1924 ENC_CODERANGE_CLEAR(copy);
1925 return copy;
1926 }
1927
1928 long capa = 0;
1929 capa = NUM2LONG(capacity);
1930 if (capa < 0) {
1931 capa = 0;
1932 }
1933
1934 if (!NIL_P(orig)) {
1935 long orig_capa = rb_str_capacity(orig);
1936 if (orig_capa > capa) {
1937 capa = orig_capa;
1938 }
1939 }
1940
1941 long fake_len = capa - termlen;
1942 if (fake_len < 0) {
1943 fake_len = 0;
1944 }
1945
1946 VALUE str = str_new0(klass, NULL, fake_len, termlen);
1947 STR_SET_LEN(str, 0);
1948 TERM_FILL(RSTRING_PTR(str), termlen);
1949
1950 if (enc) {
1951 rb_enc_associate(str, enc);
1952 }
1953
1954 if (!NIL_P(orig)) {
1955 rb_str_buf_append(str, orig);
1956 }
1957
1958 return str;
1959}
1960
#ifdef NONASCII_MASK
/* True unless +c+ has the 10xxxxxx bit pattern of a UTF-8 continuation byte. */
#define is_utf8_lead_byte(c) (((c)&0xC0) != 0x80)

/*
 * UTF-8 leading bytes have either 0xxxxxxx or 11xxxxxx
 * bit representation. (see https://en.wikipedia.org/wiki/UTF-8)
 * Therefore, the following pseudocode can detect UTF-8 leading bytes.
 *
 * if (!(byte & 0x80))
 *   byte |= 0x40;          // turn on bit6
 * return ((byte>>6) & 1);  // bit6 represent whether this byte is leading or not.
 *
 * This function calculates whether a byte is leading or not for all bytes
 * in the argument word by concurrently using the above logic, and then
 * adds up the number of leading bytes in the word.
 */
static inline uintptr_t
count_utf8_lead_bytes_with_word(const uintptr_t *s)
{
    uintptr_t word = *s;

    /* Transform so that bit0 indicates whether we have a UTF-8 leading byte or not. */
    word = (word>>6) | (~word>>7);
    word &= NONASCII_MASK >> 7;

    /* Gather all bytes. */
#if defined(HAVE_BUILTIN___BUILTIN_POPCOUNT) && defined(__POPCNT__)
    /* use only if it can use POPCNT */
    return rb_popcount_intptr(word);
#else
    /* Fold the per-byte flags down into the lowest byte. */
    word += (word>>8);
    word += (word>>16);
# if SIZEOF_VOIDP == 8
    word += (word>>32);
# endif
    return (word&0xF);
#endif
}
#endif
2000
2001static inline long
2002enc_strlen(const char *p, const char *e, rb_encoding *enc, int cr)
2003{
2004 long c;
2005 const char *q;
2006
2007 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2008 long diff = (long)(e - p);
2009 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2010 }
2011#ifdef NONASCII_MASK
2012 else if (cr == ENC_CODERANGE_VALID && enc == rb_utf8_encoding()) {
2013 uintptr_t len = 0;
2014 if ((int)sizeof(uintptr_t) * 2 < e - p) {
2015 const uintptr_t *s, *t;
2016 const uintptr_t lowbits = sizeof(uintptr_t) - 1;
2017 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2018 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2019 while (p < (const char *)s) {
2020 if (is_utf8_lead_byte(*p)) len++;
2021 p++;
2022 }
2023 while (s < t) {
2024 len += count_utf8_lead_bytes_with_word(s);
2025 s++;
2026 }
2027 p = (const char *)s;
2028 }
2029 while (p < e) {
2030 if (is_utf8_lead_byte(*p)) len++;
2031 p++;
2032 }
2033 return (long)len;
2034 }
2035#endif
2036 else if (rb_enc_asciicompat(enc)) {
2037 c = 0;
2038 if (ENC_CODERANGE_CLEAN_P(cr)) {
2039 while (p < e) {
2040 if (ISASCII(*p)) {
2041 q = search_nonascii(p, e);
2042 if (!q)
2043 return c + (e - p);
2044 c += q - p;
2045 p = q;
2046 }
2047 p += rb_enc_fast_mbclen(p, e, enc);
2048 c++;
2049 }
2050 }
2051 else {
2052 while (p < e) {
2053 if (ISASCII(*p)) {
2054 q = search_nonascii(p, e);
2055 if (!q)
2056 return c + (e - p);
2057 c += q - p;
2058 p = q;
2059 }
2060 p += rb_enc_mbclen(p, e, enc);
2061 c++;
2062 }
2063 }
2064 return c;
2065 }
2066
2067 for (c=0; p<e; c++) {
2068 p += rb_enc_mbclen(p, e, enc);
2069 }
2070 return c;
2071}
2072
2073long
2074rb_enc_strlen(const char *p, const char *e, rb_encoding *enc)
2075{
2076 return enc_strlen(p, e, enc, ENC_CODERANGE_UNKNOWN);
2077}
2078
2079/* To get strlen with cr
2080 * Note that given cr is not used.
2081 */
2082long
2083rb_enc_strlen_cr(const char *p, const char *e, rb_encoding *enc, int *cr)
2084{
2085 long c;
2086 const char *q;
2087 int ret;
2088
2089 *cr = 0;
2090 if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2091 long diff = (long)(e - p);
2092 return diff / rb_enc_mbminlen(enc) + !!(diff % rb_enc_mbminlen(enc));
2093 }
2094 else if (rb_enc_asciicompat(enc)) {
2095 c = 0;
2096 while (p < e) {
2097 if (ISASCII(*p)) {
2098 q = search_nonascii(p, e);
2099 if (!q) {
2100 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2101 return c + (e - p);
2102 }
2103 c += q - p;
2104 p = q;
2105 }
2106 ret = rb_enc_precise_mbclen(p, e, enc);
2107 if (MBCLEN_CHARFOUND_P(ret)) {
2108 *cr |= ENC_CODERANGE_VALID;
2109 p += MBCLEN_CHARFOUND_LEN(ret);
2110 }
2111 else {
2113 p++;
2114 }
2115 c++;
2116 }
2117 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2118 return c;
2119 }
2120
2121 for (c=0; p<e; c++) {
2122 ret = rb_enc_precise_mbclen(p, e, enc);
2123 if (MBCLEN_CHARFOUND_P(ret)) {
2124 *cr |= ENC_CODERANGE_VALID;
2125 p += MBCLEN_CHARFOUND_LEN(ret);
2126 }
2127 else {
2129 if (p + rb_enc_mbminlen(enc) <= e)
2130 p += rb_enc_mbminlen(enc);
2131 else
2132 p = e;
2133 }
2134 }
2135 if (!*cr) *cr = ENC_CODERANGE_7BIT;
2136 return c;
2137}
2138
2139/* enc must be str's enc or rb_enc_check(str, str2) */
2140static long
2141str_strlen(VALUE str, rb_encoding *enc)
2142{
2143 const char *p, *e;
2144 int cr;
2145
2146 if (single_byte_optimizable(str)) return RSTRING_LEN(str);
2147 if (!enc) enc = STR_ENC_GET(str);
2148 p = RSTRING_PTR(str);
2149 e = RSTRING_END(str);
2150 cr = ENC_CODERANGE(str);
2151
2152 if (cr == ENC_CODERANGE_UNKNOWN) {
2153 long n = rb_enc_strlen_cr(p, e, enc, &cr);
2154 if (cr) ENC_CODERANGE_SET(str, cr);
2155 return n;
2156 }
2157 else {
2158 return enc_strlen(p, e, enc, cr);
2159 }
2160}
2161
2162long
2164{
2165 return str_strlen(str, NULL);
2166}
2167
2168/*
2169 * call-seq:
2170 * length -> integer
2171 *
2172 * :include: doc/string/length.rdoc
2173 *
2174 */
2175
2176VALUE
2178{
2179 return LONG2NUM(str_strlen(str, NULL));
2180}
2181
2182/*
2183 * call-seq:
2184 * bytesize -> integer
2185 *
2186 * :include: doc/string/bytesize.rdoc
2187 *
2188 */
2189
2190VALUE
2191rb_str_bytesize(VALUE str)
2192{
2193 return LONG2NUM(RSTRING_LEN(str));
2194}
2195
2196/*
2197 * call-seq:
2198 * empty? -> true or false
2199 *
2200 * Returns +true+ if the length of +self+ is zero, +false+ otherwise:
2201 *
2202 * "hello".empty? # => false
2203 * " ".empty? # => false
2204 * "".empty? # => true
2205 *
2206 */
2207
2208static VALUE
2209rb_str_empty(VALUE str)
2210{
2211 return RBOOL(RSTRING_LEN(str) == 0);
2212}
2213
2214/*
2215 * call-seq:
2216 * string + other_string -> new_string
2217 *
2218 * Returns a new \String containing +other_string+ concatenated to +self+:
2219 *
2220 * "Hello from " + self.to_s # => "Hello from main"
2221 *
2222 */
2223
2224VALUE
2226{
2227 VALUE str3;
2228 rb_encoding *enc;
2229 char *ptr1, *ptr2, *ptr3;
2230 long len1, len2;
2231 int termlen;
2232
2233 StringValue(str2);
2234 enc = rb_enc_check_str(str1, str2);
2235 RSTRING_GETMEM(str1, ptr1, len1);
2236 RSTRING_GETMEM(str2, ptr2, len2);
2237 termlen = rb_enc_mbminlen(enc);
2238 if (len1 > LONG_MAX - len2) {
2239 rb_raise(rb_eArgError, "string size too big");
2240 }
2241 str3 = str_new0(rb_cString, 0, len1+len2, termlen);
2242 ptr3 = RSTRING_PTR(str3);
2243 memcpy(ptr3, ptr1, len1);
2244 memcpy(ptr3+len1, ptr2, len2);
2245 TERM_FILL(&ptr3[len1+len2], termlen);
2246
2247 ENCODING_CODERANGE_SET(str3, rb_enc_to_index(enc),
2249 RB_GC_GUARD(str1);
2250 RB_GC_GUARD(str2);
2251 return str3;
2252}
2253
2254/* A variant of rb_str_plus that does not raise but return Qundef instead. */
2255VALUE
2256rb_str_opt_plus(VALUE str1, VALUE str2)
2257{
2258 assert(RBASIC_CLASS(str1) == rb_cString);
2259 assert(RBASIC_CLASS(str2) == rb_cString);
2260 long len1, len2;
2261 MAYBE_UNUSED(char) *ptr1, *ptr2;
2262 RSTRING_GETMEM(str1, ptr1, len1);
2263 RSTRING_GETMEM(str2, ptr2, len2);
2264 int enc1 = rb_enc_get_index(str1);
2265 int enc2 = rb_enc_get_index(str2);
2266
2267 if (enc1 < 0) {
2268 return Qundef;
2269 }
2270 else if (enc2 < 0) {
2271 return Qundef;
2272 }
2273 else if (enc1 != enc2) {
2274 return Qundef;
2275 }
2276 else if (len1 > LONG_MAX - len2) {
2277 return Qundef;
2278 }
2279 else {
2280 return rb_str_plus(str1, str2);
2281 }
2282
2283}
2284
2285/*
2286 * call-seq:
2287 * string * integer -> new_string
2288 *
2289 * Returns a new \String containing +integer+ copies of +self+:
2290 *
2291 * "Ho! " * 3 # => "Ho! Ho! Ho! "
2292 * "Ho! " * 0 # => ""
2293 *
2294 */
2295
2296VALUE
2298{
2299 VALUE str2;
2300 long n, len;
2301 char *ptr2;
2302 int termlen;
2303
2304 if (times == INT2FIX(1)) {
2305 return str_duplicate(rb_cString, str);
2306 }
2307 if (times == INT2FIX(0)) {
2308 str2 = str_alloc_embed(rb_cString, 0);
2309 rb_enc_copy(str2, str);
2310 return str2;
2311 }
2312 len = NUM2LONG(times);
2313 if (len < 0) {
2314 rb_raise(rb_eArgError, "negative argument");
2315 }
2316 if (RSTRING_LEN(str) == 1 && RSTRING_PTR(str)[0] == 0) {
2317 if (STR_EMBEDDABLE_P(len, 1)) {
2318 str2 = str_alloc_embed(rb_cString, len + 1);
2319 memset(RSTRING_PTR(str2), 0, len + 1);
2320 }
2321 else {
2322 str2 = str_alloc_heap(rb_cString);
2323 RSTRING(str2)->as.heap.aux.capa = len;
2324 RSTRING(str2)->as.heap.ptr = ZALLOC_N(char, (size_t)len + 1);
2325 }
2326 STR_SET_LEN(str2, len);
2327 rb_enc_copy(str2, str);
2328 return str2;
2329 }
2330 if (len && LONG_MAX/len < RSTRING_LEN(str)) {
2331 rb_raise(rb_eArgError, "argument too big");
2332 }
2333
2334 len *= RSTRING_LEN(str);
2335 termlen = TERM_LEN(str);
2336 str2 = str_new0(rb_cString, 0, len, termlen);
2337 ptr2 = RSTRING_PTR(str2);
2338 if (len) {
2339 n = RSTRING_LEN(str);
2340 memcpy(ptr2, RSTRING_PTR(str), n);
2341 while (n <= len/2) {
2342 memcpy(ptr2 + n, ptr2, n);
2343 n *= 2;
2344 }
2345 memcpy(ptr2 + n, ptr2, len-n);
2346 }
2347 STR_SET_LEN(str2, len);
2348 TERM_FILL(&ptr2[len], termlen);
2349 rb_enc_cr_str_copy_for_substr(str2, str);
2350
2351 return str2;
2352}
2353
2354/*
2355 * call-seq:
2356 * string % object -> new_string
2357 *
2358 * Returns the result of formatting +object+ into the format specification +self+
2359 * (see Kernel#sprintf for formatting details):
2360 *
2361 * "%05d" % 123 # => "00123"
2362 *
2363 * If +self+ contains multiple substitutions, +object+ must be
2364 * an Array or Hash containing the values to be substituted:
2365 *
2366 * "%-5s: %016x" % [ "ID", self.object_id ] # => "ID : 00002b054ec93168"
2367 * "foo = %{foo}" % {foo: 'bar'} # => "foo = bar"
2368 * "foo = %{foo}, baz = %{baz}" % {foo: 'bar', baz: 'bat'} # => "foo = bar, baz = bat"
2369 *
2370 */
2371
2372static VALUE
2373rb_str_format_m(VALUE str, VALUE arg)
2374{
2375 VALUE tmp = rb_check_array_type(arg);
2376
2377 if (!NIL_P(tmp)) {
2378 return rb_str_format(RARRAY_LENINT(tmp), RARRAY_CONST_PTR(tmp), str);
2379 }
2380 return rb_str_format(1, &arg, str);
2381}
2382
2383static inline void
2384rb_check_lockedtmp(VALUE str)
2385{
2386 if (FL_TEST(str, STR_TMPLOCK)) {
2387 rb_raise(rb_eRuntimeError, "can't modify string; temporarily locked");
2388 }
2389}
2390
2391static inline void
2392str_modifiable(VALUE str)
2393{
2394 rb_check_lockedtmp(str);
2395 rb_check_frozen(str);
2396}
2397
2398static inline int
2399str_dependent_p(VALUE str)
2400{
2401 if (STR_EMBED_P(str) || !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2402 return 0;
2403 }
2404 else {
2405 return 1;
2406 }
2407}
2408
2409static inline int
2410str_independent(VALUE str)
2411{
2412 str_modifiable(str);
2413 return !str_dependent_p(str);
2414}
2415
2416static void
2417str_make_independent_expand(VALUE str, long len, long expand, const int termlen)
2418{
2419 char *ptr;
2420 char *oldptr;
2421 long capa = len + expand;
2422
2423 if (len > capa) len = capa;
2424
2425 if (!STR_EMBED_P(str) && str_embed_capa(str) >= capa + termlen) {
2426 ptr = RSTRING(str)->as.heap.ptr;
2427 STR_SET_EMBED(str);
2428 memcpy(RSTRING(str)->as.embed.ary, ptr, len);
2429 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
2430 STR_SET_LEN(str, len);
2431 return;
2432 }
2433
2434 ptr = ALLOC_N(char, (size_t)capa + termlen);
2435 oldptr = RSTRING_PTR(str);
2436 if (oldptr) {
2437 memcpy(ptr, oldptr, len);
2438 }
2439 if (FL_TEST_RAW(str, STR_NOEMBED|STR_NOFREE|STR_SHARED) == STR_NOEMBED) {
2440 xfree(oldptr);
2441 }
2442 STR_SET_NOEMBED(str);
2443 FL_UNSET(str, STR_SHARED|STR_NOFREE);
2444 TERM_FILL(ptr + len, termlen);
2445 RSTRING(str)->as.heap.ptr = ptr;
2446 STR_SET_LEN(str, len);
2447 RSTRING(str)->as.heap.aux.capa = capa;
2448}
2449
2450void
2451rb_str_modify(VALUE str)
2452{
2453 if (!str_independent(str))
2454 str_make_independent(str);
2456}
2457
2458void
2460{
2461 int termlen = TERM_LEN(str);
2462 long len = RSTRING_LEN(str);
2463
2464 if (expand < 0) {
2465 rb_raise(rb_eArgError, "negative expanding string size");
2466 }
2467 if (expand >= LONG_MAX - len) {
2468 rb_raise(rb_eArgError, "string size too big");
2469 }
2470
2471 if (!str_independent(str)) {
2472 str_make_independent_expand(str, len, expand, termlen);
2473 }
2474 else if (expand > 0) {
2475 RESIZE_CAPA_TERM(str, len + expand, termlen);
2476 }
2478}
2479
2480/* As rb_str_modify(), but don't clear coderange */
2481static void
2482str_modify_keep_cr(VALUE str)
2483{
2484 if (!str_independent(str))
2485 str_make_independent(str);
2487 /* Force re-scan later */
2489}
2490
2491static inline void
2492str_discard(VALUE str)
2493{
2494 str_modifiable(str);
2495 if (!STR_EMBED_P(str) && !FL_TEST(str, STR_SHARED|STR_NOFREE)) {
2496 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
2497 RSTRING(str)->as.heap.ptr = 0;
2498 STR_SET_LEN(str, 0);
2499 }
2500}
2501
2502void
2504{
2505 rb_encoding *enc = rb_enc_get(str);
2506 if (!enc) {
2507 rb_raise(rb_eTypeError, "not encoding capable object");
2508 }
2509 if (!rb_enc_asciicompat(enc)) {
2510 rb_raise(rb_eEncCompatError, "ASCII incompatible encoding: %s", rb_enc_name(enc));
2511 }
2512}
2513
2514VALUE
2516{
2517 VALUE s = *ptr;
2518 if (!RB_TYPE_P(s, T_STRING)) {
2519 s = rb_str_to_str(s);
2520 *ptr = s;
2521 }
2522 return s;
2523}
2524
2525char *
2527{
2528 VALUE str = rb_string_value(ptr);
2529 return RSTRING_PTR(str);
2530}
2531
/*
 * Returns 1 when the first +n+ bytes at +s+ are all zero, 0 as soon as a
 * non-zero byte is found.  A non-positive +n+ reads nothing and yields 1.
 */
static int
zero_filled(const char *s, int n)
{
    for (; n > 0; --n) {
        if (*s++) return 0;
    }
    return 1;
}
2540
2541static const char *
2542str_null_char(const char *s, long len, const int minlen, rb_encoding *enc)
2543{
2544 const char *e = s + len;
2545
2546 for (; s + minlen <= e; s += rb_enc_mbclen(s, e, enc)) {
2547 if (zero_filled(s, minlen)) return s;
2548 }
2549 return 0;
2550}
2551
2552static char *
2553str_fill_term(VALUE str, char *s, long len, int termlen)
2554{
2555 /* This function assumes that (capa + termlen) bytes of memory
2556 * is allocated, like many other functions in this file.
2557 */
2558 if (str_dependent_p(str)) {
2559 if (!zero_filled(s + len, termlen))
2560 str_make_independent_expand(str, len, 0L, termlen);
2561 }
2562 else {
2563 TERM_FILL(s + len, termlen);
2564 return s;
2565 }
2566 return RSTRING_PTR(str);
2567}
2568
2569void
2570rb_str_change_terminator_length(VALUE str, const int oldtermlen, const int termlen)
2571{
2572 long capa = str_capacity(str, oldtermlen) + oldtermlen;
2573 long len = RSTRING_LEN(str);
2574
2575 assert(capa >= len);
2576 if (capa - len < termlen) {
2577 rb_check_lockedtmp(str);
2578 str_make_independent_expand(str, len, 0L, termlen);
2579 }
2580 else if (str_dependent_p(str)) {
2581 if (termlen > oldtermlen)
2582 str_make_independent_expand(str, len, 0L, termlen);
2583 }
2584 else {
2585 if (!STR_EMBED_P(str)) {
2586 /* modify capa instead of realloc */
2587 assert(!FL_TEST((str), STR_SHARED));
2588 RSTRING(str)->as.heap.aux.capa = capa - termlen;
2589 }
2590 if (termlen > oldtermlen) {
2591 TERM_FILL(RSTRING_PTR(str) + len, termlen);
2592 }
2593 }
2594
2595 return;
2596}
2597
2598static char *
2599str_null_check(VALUE str, int *w)
2600{
2601 char *s = RSTRING_PTR(str);
2602 long len = RSTRING_LEN(str);
2603 rb_encoding *enc = rb_enc_get(str);
2604 const int minlen = rb_enc_mbminlen(enc);
2605
2606 if (minlen > 1) {
2607 *w = 1;
2608 if (str_null_char(s, len, minlen, enc)) {
2609 return NULL;
2610 }
2611 return str_fill_term(str, s, len, minlen);
2612 }
2613 *w = 0;
2614 if (!s || memchr(s, 0, len)) {
2615 return NULL;
2616 }
2617 if (s[len]) {
2618 s = str_fill_term(str, s, len, minlen);
2619 }
2620 return s;
2621}
2622
2623char *
2624rb_str_to_cstr(VALUE str)
2625{
2626 int w;
2627 return str_null_check(str, &w);
2628}
2629
2630char *
2632{
2633 VALUE str = rb_string_value(ptr);
2634 int w;
2635 char *s = str_null_check(str, &w);
2636 if (!s) {
2637 if (w) {
2638 rb_raise(rb_eArgError, "string contains null char");
2639 }
2640 rb_raise(rb_eArgError, "string contains null byte");
2641 }
2642 return s;
2643}
2644
2645char *
2646rb_str_fill_terminator(VALUE str, const int newminlen)
2647{
2648 char *s = RSTRING_PTR(str);
2649 long len = RSTRING_LEN(str);
2650 return str_fill_term(str, s, len, newminlen);
2651}
2652
2653VALUE
2655{
2656 str = rb_check_convert_type_with_id(str, T_STRING, "String", idTo_str);
2657 return str;
2658}
2659
2660/*
2661 * call-seq:
2662 * String.try_convert(object) -> object, new_string, or nil
2663 *
2664 * If +object+ is a \String object, returns +object+.
2665 *
2666 * Otherwise if +object+ responds to <tt>:to_str</tt>,
2667 * calls <tt>object.to_str</tt> and returns the result.
2668 *
2669 * Returns +nil+ if +object+ does not respond to <tt>:to_str</tt>.
2670 *
2671 * Raises an exception unless <tt>object.to_str</tt> returns a \String object.
2672 */
2673static VALUE
2674rb_str_s_try_convert(VALUE dummy, VALUE str)
2675{
2676 return rb_check_string_type(str);
2677}
2678
2679static char*
2680str_nth_len(const char *p, const char *e, long *nthp, rb_encoding *enc)
2681{
2682 long nth = *nthp;
2683 if (rb_enc_mbmaxlen(enc) == 1) {
2684 p += nth;
2685 }
2686 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2687 p += nth * rb_enc_mbmaxlen(enc);
2688 }
2689 else if (rb_enc_asciicompat(enc)) {
2690 const char *p2, *e2;
2691 int n;
2692
2693 while (p < e && 0 < nth) {
2694 e2 = p + nth;
2695 if (e < e2) {
2696 *nthp = nth;
2697 return (char *)e;
2698 }
2699 if (ISASCII(*p)) {
2700 p2 = search_nonascii(p, e2);
2701 if (!p2) {
2702 nth -= e2 - p;
2703 *nthp = nth;
2704 return (char *)e2;
2705 }
2706 nth -= p2 - p;
2707 p = p2;
2708 }
2709 n = rb_enc_mbclen(p, e, enc);
2710 p += n;
2711 nth--;
2712 }
2713 *nthp = nth;
2714 if (nth != 0) {
2715 return (char *)e;
2716 }
2717 return (char *)p;
2718 }
2719 else {
2720 while (p < e && nth--) {
2721 p += rb_enc_mbclen(p, e, enc);
2722 }
2723 }
2724 if (p > e) p = e;
2725 *nthp = nth;
2726 return (char*)p;
2727}
2728
2729char*
2730rb_enc_nth(const char *p, const char *e, long nth, rb_encoding *enc)
2731{
2732 return str_nth_len(p, e, &nth, enc);
2733}
2734
2735static char*
2736str_nth(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2737{
2738 if (singlebyte)
2739 p += nth;
2740 else {
2741 p = str_nth_len(p, e, &nth, enc);
2742 }
2743 if (!p) return 0;
2744 if (p > e) p = e;
2745 return (char *)p;
2746}
2747
2748/* char offset to byte offset */
2749static long
2750str_offset(const char *p, const char *e, long nth, rb_encoding *enc, int singlebyte)
2751{
2752 const char *pp = str_nth(p, e, nth, enc, singlebyte);
2753 if (!pp) return e - p;
2754 return pp - p;
2755}
2756
2757long
2758rb_str_offset(VALUE str, long pos)
2759{
2760 return str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
2761 STR_ENC_GET(str), single_byte_optimizable(str));
2762}
2763
2764#ifdef NONASCII_MASK
2765static char *
2766str_utf8_nth(const char *p, const char *e, long *nthp)
2767{
2768 long nth = *nthp;
2769 if ((int)SIZEOF_VOIDP * 2 < e - p && (int)SIZEOF_VOIDP * 2 < nth) {
2770 const uintptr_t *s, *t;
2771 const uintptr_t lowbits = SIZEOF_VOIDP - 1;
2772 s = (const uintptr_t*)(~lowbits & ((uintptr_t)p + lowbits));
2773 t = (const uintptr_t*)(~lowbits & (uintptr_t)e);
2774 while (p < (const char *)s) {
2775 if (is_utf8_lead_byte(*p)) nth--;
2776 p++;
2777 }
2778 do {
2779 nth -= count_utf8_lead_bytes_with_word(s);
2780 s++;
2781 } while (s < t && (int)SIZEOF_VOIDP <= nth);
2782 p = (char *)s;
2783 }
2784 while (p < e) {
2785 if (is_utf8_lead_byte(*p)) {
2786 if (nth == 0) break;
2787 nth--;
2788 }
2789 p++;
2790 }
2791 *nthp = nth;
2792 return (char *)p;
2793}
2794
2795static long
2796str_utf8_offset(const char *p, const char *e, long nth)
2797{
2798 const char *pp = str_utf8_nth(p, e, &nth);
2799 return pp - p;
2800}
2801#endif
2802
2803/* byte offset to char offset */
2804long
2805rb_str_sublen(VALUE str, long pos)
2806{
2807 if (single_byte_optimizable(str) || pos < 0)
2808 return pos;
2809 else {
2810 char *p = RSTRING_PTR(str);
2811 return enc_strlen(p, p + pos, STR_ENC_GET(str), ENC_CODERANGE(str));
2812 }
2813}
2814
2815static VALUE
2816str_subseq(VALUE str, long beg, long len)
2817{
2818 VALUE str2;
2819
2820 assert(beg >= 0);
2821 assert(len >= 0);
2822 assert(beg+len <= RSTRING_LEN(str));
2823
2824 const int termlen = TERM_LEN(str);
2825 if (!SHARABLE_SUBSTRING_P(beg, len, RSTRING_LEN(str))) {
2826 str2 = rb_str_new(RSTRING_PTR(str) + beg, len);
2827 RB_GC_GUARD(str);
2828 return str2;
2829 }
2830
2831 str2 = str_alloc_heap(rb_cString);
2832 if (str_embed_capa(str2) >= len + termlen) {
2833 char *ptr2 = RSTRING(str2)->as.embed.ary;
2834 STR_SET_EMBED(str2);
2835 memcpy(ptr2, RSTRING_PTR(str) + beg, len);
2836 TERM_FILL(ptr2+len, termlen);
2837
2838 STR_SET_LEN(str2, len);
2839 RB_GC_GUARD(str);
2840 }
2841 else {
2842 str_replace_shared(str2, str);
2843 assert(!STR_EMBED_P(str2));
2844 ENC_CODERANGE_CLEAR(str2);
2845 RSTRING(str2)->as.heap.ptr += beg;
2846 if (RSTRING_LEN(str2) > len) {
2847 STR_SET_LEN(str2, len);
2848 }
2849 }
2850
2851 return str2;
2852}
2853
2854VALUE
2855rb_str_subseq(VALUE str, long beg, long len)
2856{
2857 VALUE str2 = str_subseq(str, beg, len);
2858 rb_enc_cr_str_copy_for_substr(str2, str);
2859 return str2;
2860}
2861
2862char *
2863rb_str_subpos(VALUE str, long beg, long *lenp)
2864{
2865 long len = *lenp;
2866 long slen = -1L;
2867 long blen = RSTRING_LEN(str);
2868 rb_encoding *enc = STR_ENC_GET(str);
2869 char *p, *s = RSTRING_PTR(str), *e = s + blen;
2870
2871 if (len < 0) return 0;
2872 if (!blen) {
2873 len = 0;
2874 }
2875 if (single_byte_optimizable(str)) {
2876 if (beg > blen) return 0;
2877 if (beg < 0) {
2878 beg += blen;
2879 if (beg < 0) return 0;
2880 }
2881 if (len > blen - beg)
2882 len = blen - beg;
2883 if (len < 0) return 0;
2884 p = s + beg;
2885 goto end;
2886 }
2887 if (beg < 0) {
2888 if (len > -beg) len = -beg;
2889 if (-beg * rb_enc_mbmaxlen(enc) < RSTRING_LEN(str) / 8) {
2890 beg = -beg;
2891 while (beg-- > len && (e = rb_enc_prev_char(s, e, e, enc)) != 0);
2892 p = e;
2893 if (!p) return 0;
2894 while (len-- > 0 && (p = rb_enc_prev_char(s, p, e, enc)) != 0);
2895 if (!p) return 0;
2896 len = e - p;
2897 goto end;
2898 }
2899 else {
2900 slen = str_strlen(str, enc);
2901 beg += slen;
2902 if (beg < 0) return 0;
2903 p = s + beg;
2904 if (len == 0) goto end;
2905 }
2906 }
2907 else if (beg > 0 && beg > RSTRING_LEN(str)) {
2908 return 0;
2909 }
2910 if (len == 0) {
2911 if (beg > str_strlen(str, enc)) return 0; /* str's enc */
2912 p = s + beg;
2913 }
2914#ifdef NONASCII_MASK
2915 else if (ENC_CODERANGE(str) == ENC_CODERANGE_VALID &&
2916 enc == rb_utf8_encoding()) {
2917 p = str_utf8_nth(s, e, &beg);
2918 if (beg > 0) return 0;
2919 len = str_utf8_offset(p, e, len);
2920 }
2921#endif
2922 else if (rb_enc_mbmaxlen(enc) == rb_enc_mbminlen(enc)) {
2923 int char_sz = rb_enc_mbmaxlen(enc);
2924
2925 p = s + beg * char_sz;
2926 if (p > e) {
2927 return 0;
2928 }
2929 else if (len * char_sz > e - p)
2930 len = e - p;
2931 else
2932 len *= char_sz;
2933 }
2934 else if ((p = str_nth_len(s, e, &beg, enc)) == e) {
2935 if (beg > 0) return 0;
2936 len = 0;
2937 }
2938 else {
2939 len = str_offset(p, e, len, enc, 0);
2940 }
2941 end:
2942 *lenp = len;
2943 RB_GC_GUARD(str);
2944 return p;
2945}
2946
2947static VALUE str_substr(VALUE str, long beg, long len, int empty);
2948
2949VALUE
2950rb_str_substr(VALUE str, long beg, long len)
2951{
2952 return str_substr(str, beg, len, TRUE);
2953}
2954
2955static VALUE
2956str_substr(VALUE str, long beg, long len, int empty)
2957{
2958 char *p = rb_str_subpos(str, beg, &len);
2959
2960 if (!p) return Qnil;
2961 if (!len && !empty) return Qnil;
2962
2963 beg = p - RSTRING_PTR(str);
2964
2965 VALUE str2 = str_subseq(str, beg, len);
2966 rb_enc_cr_str_copy_for_substr(str2, str);
2967 return str2;
2968}
2969
2970/* :nodoc: */
2971VALUE
2973{
2974 if (OBJ_FROZEN(str)) return str;
2975 rb_str_resize(str, RSTRING_LEN(str));
2976 return rb_obj_freeze(str);
2977}
2978
2979
2980/*
2981 * call-seq:
2982 * +string -> new_string or self
2983 *
2984 * Returns +self+ if +self+ is not frozen.
2985 *
2986 * Otherwise returns <tt>self.dup</tt>, which is not frozen.
2987 */
2988static VALUE
2989str_uplus(VALUE str)
2990{
2991 if (OBJ_FROZEN(str)) {
2992 return rb_str_dup(str);
2993 }
2994 else {
2995 return str;
2996 }
2997}
2998
2999/*
3000 * call-seq:
3001 * -string -> frozen_string
3002 * dedup -> frozen_string
3003 *
3004 * Returns a frozen, possibly pre-existing copy of the string.
3005 *
3006 * The returned \String will be deduplicated as long as it does not have
3007 * any instance variables set on it and is not a String subclass.
3008 *
3009 * Note that <tt>-string</tt> variant is more convenient for defining
3010 * constants:
3011 *
3012 * FILENAME = -'config/database.yml'
3013 *
3014 * while +dedup+ is better suited for using the method in chains
3015 * of calculations:
3016 *
3017 * @url_list.concat(urls.map(&:dedup))
3018 *
3019 */
3020static VALUE
3021str_uminus(VALUE str)
3022{
3023 if (!BARE_STRING_P(str) && !rb_obj_frozen_p(str)) {
3024 str = rb_str_dup(str);
3025 }
3026 return rb_fstring(str);
3027}
3028
3029RUBY_ALIAS_FUNCTION(rb_str_dup_frozen(VALUE str), rb_str_new_frozen, (str))
3030#define rb_str_dup_frozen rb_str_new_frozen
3031
3032VALUE
3034{
3035 if (FL_TEST(str, STR_TMPLOCK)) {
3036 rb_raise(rb_eRuntimeError, "temporal locking already locked string");
3037 }
3038 FL_SET(str, STR_TMPLOCK);
3039 return str;
3040}
3041
3042VALUE
3044{
3045 if (!FL_TEST(str, STR_TMPLOCK)) {
3046 rb_raise(rb_eRuntimeError, "temporal unlocking already unlocked string");
3047 }
3048 FL_UNSET(str, STR_TMPLOCK);
3049 return str;
3050}
3051
3052RUBY_FUNC_EXPORTED VALUE
3053rb_str_locktmp_ensure(VALUE str, VALUE (*func)(VALUE), VALUE arg)
3054{
3055 rb_str_locktmp(str);
3056 return rb_ensure(func, arg, rb_str_unlocktmp, str);
3057}
3058
3059void
3060rb_str_set_len(VALUE str, long len)
3061{
3062 long capa;
3063 const int termlen = TERM_LEN(str);
3064
3065 str_modifiable(str);
3066 if (STR_SHARED_P(str)) {
3067 rb_raise(rb_eRuntimeError, "can't set length of shared string");
3068 }
3069 if (len > (capa = (long)str_capacity(str, termlen)) || len < 0) {
3070 rb_bug("probable buffer overflow: %ld for %ld", len, capa);
3071 }
3072
3073 int cr = ENC_CODERANGE(str);
3074 if (cr == ENC_CODERANGE_UNKNOWN) {
3075 /* Leave unknown. */
3076 }
3077 else if (len > RSTRING_LEN(str)) {
3078 if (ENC_CODERANGE_CLEAN_P(cr)) {
3079 /* Update the coderange regarding the extended part. */
3080 const char *const prev_end = RSTRING_END(str);
3081 const char *const new_end = RSTRING_PTR(str) + len;
3082 rb_encoding *enc = rb_enc_get(str);
3083 rb_str_coderange_scan_restartable(prev_end, new_end, enc, &cr);
3084 ENC_CODERANGE_SET(str, cr);
3085 }
3086 else if (cr == ENC_CODERANGE_BROKEN) {
3087 /* May be valid now, by appended part. */
3089 }
3090 }
3091 else if (len < RSTRING_LEN(str)) {
3092 if (cr != ENC_CODERANGE_7BIT) {
3093 /* ASCII-only string is keeping after truncated. Valid
3094 * and broken may be invalid or valid, leave unknown. */
3096 }
3097 }
3098
3099 STR_SET_LEN(str, len);
3100 TERM_FILL(&RSTRING_PTR(str)[len], termlen);
3101}
3102
3103VALUE
3104rb_str_resize(VALUE str, long len)
3105{
3106 if (len < 0) {
3107 rb_raise(rb_eArgError, "negative string size (or size too big)");
3108 }
3109
3110 int independent = str_independent(str);
3111 long slen = RSTRING_LEN(str);
3112
3113 if (slen > len && ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
3115 }
3116
3117 {
3118 long capa;
3119 const int termlen = TERM_LEN(str);
3120 if (STR_EMBED_P(str)) {
3121 if (len == slen) return str;
3122 if (str_embed_capa(str) >= len + termlen) {
3123 STR_SET_LEN(str, len);
3124 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3125 return str;
3126 }
3127 str_make_independent_expand(str, slen, len - slen, termlen);
3128 }
3129 else if (str_embed_capa(str) >= len + termlen) {
3130 char *ptr = STR_HEAP_PTR(str);
3131 STR_SET_EMBED(str);
3132 if (slen > len) slen = len;
3133 if (slen > 0) MEMCPY(RSTRING(str)->as.embed.ary, ptr, char, slen);
3134 TERM_FILL(RSTRING(str)->as.embed.ary + len, termlen);
3135 STR_SET_LEN(str, len);
3136 if (independent) ruby_xfree(ptr);
3137 return str;
3138 }
3139 else if (!independent) {
3140 if (len == slen) return str;
3141 str_make_independent_expand(str, slen, len - slen, termlen);
3142 }
3143 else if ((capa = RSTRING(str)->as.heap.aux.capa) < len ||
3144 (capa - len) > (len < 1024 ? len : 1024)) {
3145 SIZED_REALLOC_N(RSTRING(str)->as.heap.ptr, char,
3146 (size_t)len + termlen, STR_HEAP_SIZE(str));
3147 RSTRING(str)->as.heap.aux.capa = len;
3148 }
3149 else if (len == slen) return str;
3150 STR_SET_LEN(str, len);
3151 TERM_FILL(RSTRING(str)->as.heap.ptr + len, termlen); /* sentinel */
3152 }
3153 return str;
3154}
3155
3156static VALUE
3157str_buf_cat4(VALUE str, const char *ptr, long len, bool keep_cr)
3158{
3159 if (keep_cr) {
3160 str_modify_keep_cr(str);
3161 }
3162 else {
3163 rb_str_modify(str);
3164 }
3165 if (len == 0) return 0;
3166
3167 long total, olen, off = -1;
3168 char *sptr;
3169 const int termlen = TERM_LEN(str);
3170
3171 RSTRING_GETMEM(str, sptr, olen);
3172 if (ptr >= sptr && ptr <= sptr + olen) {
3173 off = ptr - sptr;
3174 }
3175
3176 long capa = str_capacity(str, termlen);
3177
3178 if (olen > LONG_MAX - len) {
3179 rb_raise(rb_eArgError, "string sizes too big");
3180 }
3181 total = olen + len;
3182 if (capa < total) {
3183 if (total >= LONG_MAX / 2) {
3184 capa = total;
3185 }
3186 while (total > capa) {
3187 capa = 2 * capa + termlen; /* == 2*(capa+termlen)-termlen */
3188 }
3189 RESIZE_CAPA_TERM(str, capa, termlen);
3190 sptr = RSTRING_PTR(str);
3191 }
3192 if (off != -1) {
3193 ptr = sptr + off;
3194 }
3195 memcpy(sptr + olen, ptr, len);
3196 STR_SET_LEN(str, total);
3197 TERM_FILL(sptr + total, termlen); /* sentinel */
3198
3199 return str;
3200}
3201
/* Shorthands for str_buf_cat4() with keep_cr == false.
 * str_buf_cat2() takes the length from rb_strlen_lit(ptr). */
#define str_buf_cat(str, ptr, len) \
    str_buf_cat4((str), (ptr), len, false)
#define str_buf_cat2(str, ptr) \
    str_buf_cat4((str), (ptr), rb_strlen_lit(ptr), false)
3204
3205VALUE
3206rb_str_cat(VALUE str, const char *ptr, long len)
3207{
3208 if (len == 0) return str;
3209 if (len < 0) {
3210 rb_raise(rb_eArgError, "negative string size (or size too big)");
3211 }
3212 return str_buf_cat(str, ptr, len);
3213}
3214
3215VALUE
3216rb_str_cat_cstr(VALUE str, const char *ptr)
3217{
3218 must_not_null(ptr);
3219 return rb_str_buf_cat(str, ptr, strlen(ptr));
3220}
3221
3222RUBY_ALIAS_FUNCTION(rb_str_buf_cat(VALUE str, const char *ptr, long len), rb_str_cat, (str, ptr, len))
3223RUBY_ALIAS_FUNCTION(rb_str_buf_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3224RUBY_ALIAS_FUNCTION(rb_str_cat2(VALUE str, const char *ptr), rb_str_cat_cstr, (str, ptr))
3225
3226static VALUE
3227rb_enc_cr_str_buf_cat(VALUE str, const char *ptr, long len,
3228 int ptr_encindex, int ptr_cr, int *ptr_cr_ret)
3229{
3230 int str_encindex = ENCODING_GET(str);
3231 int res_encindex;
3232 int str_cr, res_cr;
3233 rb_encoding *str_enc, *ptr_enc;
3234
3235 str_cr = RSTRING_LEN(str) ? ENC_CODERANGE(str) : ENC_CODERANGE_7BIT;
3236
3237 if (str_encindex == ptr_encindex) {
3238 if (str_cr != ENC_CODERANGE_UNKNOWN && ptr_cr == ENC_CODERANGE_UNKNOWN) {
3239 ptr_cr = coderange_scan(ptr, len, rb_enc_from_index(ptr_encindex));
3240 }
3241 }
3242 else {
3243 str_enc = rb_enc_from_index(str_encindex);
3244 ptr_enc = rb_enc_from_index(ptr_encindex);
3245 if (!rb_enc_asciicompat(str_enc) || !rb_enc_asciicompat(ptr_enc)) {
3246 if (len == 0)
3247 return str;
3248 if (RSTRING_LEN(str) == 0) {
3249 rb_str_buf_cat(str, ptr, len);
3250 ENCODING_CODERANGE_SET(str, ptr_encindex, ptr_cr);
3251 rb_str_change_terminator_length(str, rb_enc_mbminlen(str_enc), rb_enc_mbminlen(ptr_enc));
3252 return str;
3253 }
3254 goto incompatible;
3255 }
3256 if (ptr_cr == ENC_CODERANGE_UNKNOWN) {
3257 ptr_cr = coderange_scan(ptr, len, ptr_enc);
3258 }
3259 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3260 if (ENCODING_IS_ASCII8BIT(str) || ptr_cr != ENC_CODERANGE_7BIT) {
3261 str_cr = rb_enc_str_coderange(str);
3262 }
3263 }
3264 }
3265 if (ptr_cr_ret)
3266 *ptr_cr_ret = ptr_cr;
3267
3268 if (str_encindex != ptr_encindex &&
3269 str_cr != ENC_CODERANGE_7BIT &&
3270 ptr_cr != ENC_CODERANGE_7BIT) {
3271 str_enc = rb_enc_from_index(str_encindex);
3272 ptr_enc = rb_enc_from_index(ptr_encindex);
3273 goto incompatible;
3274 }
3275
3276 if (str_cr == ENC_CODERANGE_UNKNOWN) {
3277 res_encindex = str_encindex;
3278 res_cr = ENC_CODERANGE_UNKNOWN;
3279 }
3280 else if (str_cr == ENC_CODERANGE_7BIT) {
3281 if (ptr_cr == ENC_CODERANGE_7BIT) {
3282 res_encindex = str_encindex;
3283 res_cr = ENC_CODERANGE_7BIT;
3284 }
3285 else {
3286 res_encindex = ptr_encindex;
3287 res_cr = ptr_cr;
3288 }
3289 }
3290 else if (str_cr == ENC_CODERANGE_VALID) {
3291 res_encindex = str_encindex;
3292 if (ENC_CODERANGE_CLEAN_P(ptr_cr))
3293 res_cr = str_cr;
3294 else
3295 res_cr = ptr_cr;
3296 }
3297 else { /* str_cr == ENC_CODERANGE_BROKEN */
3298 res_encindex = str_encindex;
3299 res_cr = str_cr;
3300 if (0 < len) res_cr = ENC_CODERANGE_UNKNOWN;
3301 }
3302
3303 if (len < 0) {
3304 rb_raise(rb_eArgError, "negative string size (or size too big)");
3305 }
3306 str_buf_cat(str, ptr, len);
3307 ENCODING_CODERANGE_SET(str, res_encindex, res_cr);
3308 return str;
3309
3310 incompatible:
3311 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
3312 rb_enc_name(str_enc), rb_enc_name(ptr_enc));
3314}
3315
3316VALUE
3317rb_enc_str_buf_cat(VALUE str, const char *ptr, long len, rb_encoding *ptr_enc)
3318{
3319 return rb_enc_cr_str_buf_cat(str, ptr, len,
3320 rb_enc_to_index(ptr_enc), ENC_CODERANGE_UNKNOWN, NULL);
3321}
3322
3323VALUE
3325{
3326 /* ptr must reference NUL terminated ASCII string. */
3327 int encindex = ENCODING_GET(str);
3328 rb_encoding *enc = rb_enc_from_index(encindex);
3329 if (rb_enc_asciicompat(enc)) {
3330 return rb_enc_cr_str_buf_cat(str, ptr, strlen(ptr),
3331 encindex, ENC_CODERANGE_7BIT, 0);
3332 }
3333 else {
3334 char *buf = ALLOCA_N(char, rb_enc_mbmaxlen(enc));
3335 while (*ptr) {
3336 unsigned int c = (unsigned char)*ptr;
3337 int len = rb_enc_codelen(c, enc);
3338 rb_enc_mbcput(c, buf, enc);
3339 rb_enc_cr_str_buf_cat(str, buf, len,
3340 encindex, ENC_CODERANGE_VALID, 0);
3341 ptr++;
3342 }
3343 return str;
3344 }
3345}
3346
3347VALUE
3348rb_str_buf_append(VALUE str, VALUE str2)
3349{
3350 int str2_cr = rb_enc_str_coderange(str2);
3351
3352 if (str_enc_fastpath(str)) {
3353 switch (str2_cr) {
3354 case ENC_CODERANGE_7BIT:
3355 // If RHS is 7bit we can do simple concatenation
3356 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3357 RB_GC_GUARD(str2);
3358 return str;
3360 // If RHS is valid, we can do simple concatenation if encodings are the same
3361 if (ENCODING_GET_INLINED(str) == ENCODING_GET_INLINED(str2)) {
3362 str_buf_cat4(str, RSTRING_PTR(str2), RSTRING_LEN(str2), true);
3363 int str_cr = ENC_CODERANGE(str);
3364 if (UNLIKELY(str_cr != ENC_CODERANGE_VALID)) {
3365 ENC_CODERANGE_SET(str, RB_ENC_CODERANGE_AND(str_cr, str2_cr));
3366 }
3367 RB_GC_GUARD(str2);
3368 return str;
3369 }
3370 }
3371 }
3372
3373 rb_enc_cr_str_buf_cat(str, RSTRING_PTR(str2), RSTRING_LEN(str2),
3374 ENCODING_GET(str2), str2_cr, &str2_cr);
3375
3376 ENC_CODERANGE_SET(str2, str2_cr);
3377
3378 return str;
3379}
3380
3381VALUE
3383{
3384 StringValue(str2);
3385 return rb_str_buf_append(str, str2);
3386}
3387
3388VALUE
3389rb_str_concat_literals(size_t num, const VALUE *strary)
3390{
3391 VALUE str;
3392 size_t i, s = 0;
3393 unsigned long len = 1;
3394
3395 if (UNLIKELY(!num)) return rb_str_new(0, 0);
3396 if (UNLIKELY(num == 1)) return rb_str_resurrect(strary[0]);
3397
3398 for (i = 0; i < num; ++i) { len += RSTRING_LEN(strary[i]); }
3399 str = rb_str_buf_new(len);
3400 str_enc_copy_direct(str, strary[0]);
3401
3402 for (i = s; i < num; ++i) {
3403 const VALUE v = strary[i];
3404 int encidx = ENCODING_GET(v);
3405
3406 rb_str_buf_append(str, v);
3407 if (encidx != ENCINDEX_US_ASCII) {
3408 if (ENCODING_GET_INLINED(str) == ENCINDEX_US_ASCII)
3409 rb_enc_set_index(str, encidx);
3410 }
3411 }
3412 return str;
3413}
3414
3415/*
3416 * call-seq:
3417 * concat(*objects) -> string
3418 *
3419 * Concatenates each object in +objects+ to +self+ and returns +self+:
3420 *
3421 * s = 'foo'
3422 * s.concat('bar', 'baz') # => "foobarbaz"
3423 * s # => "foobarbaz"
3424 *
3425 * For each given object +object+ that is an Integer,
3426 * the value is considered a codepoint and converted to a character before concatenation:
3427 *
3428 * s = 'foo'
3429 * s.concat(32, 'bar', 32, 'baz') # => "foo bar baz"
3430 *
3431 * Related: String#<<, which takes a single argument.
3432 */
3433static VALUE
3434rb_str_concat_multi(int argc, VALUE *argv, VALUE str)
3435{
3436 str_modifiable(str);
3437
3438 if (argc == 1) {
3439 return rb_str_concat(str, argv[0]);
3440 }
3441 else if (argc > 1) {
3442 int i;
3443 VALUE arg_str = rb_str_tmp_new(0);
3444 rb_enc_copy(arg_str, str);
3445 for (i = 0; i < argc; i++) {
3446 rb_str_concat(arg_str, argv[i]);
3447 }
3448 rb_str_buf_append(str, arg_str);
3449 }
3450
3451 return str;
3452}
3453
3454/*
3455 * call-seq:
3456 * string << object -> string
3457 *
3458 * Concatenates +object+ to +self+ and returns +self+:
3459 *
3460 * s = 'foo'
3461 * s << 'bar' # => "foobar"
3462 * s # => "foobar"
3463 *
3464 * If +object+ is an Integer,
3465 * the value is considered a codepoint and converted to a character before concatenation:
3466 *
3467 * s = 'foo'
3468 * s << 33 # => "foo!"
3469 *
3470 * Related: String#concat, which takes multiple arguments.
3471 */
3472VALUE
3474{
3475 unsigned int code;
3476 rb_encoding *enc = STR_ENC_GET(str1);
3477 int encidx;
3478
3479 if (RB_INTEGER_TYPE_P(str2)) {
3480 if (rb_num_to_uint(str2, &code) == 0) {
3481 }
3482 else if (FIXNUM_P(str2)) {
3483 rb_raise(rb_eRangeError, "%ld out of char range", FIX2LONG(str2));
3484 }
3485 else {
3486 rb_raise(rb_eRangeError, "bignum out of char range");
3487 }
3488 }
3489 else {
3490 return rb_str_append(str1, str2);
3491 }
3492
3493 encidx = rb_ascii8bit_appendable_encoding_index(enc, code);
3494 if (encidx >= 0) {
3495 char buf[1];
3496 buf[0] = (char)code;
3497 rb_str_cat(str1, buf, 1);
3498 if (encidx != rb_enc_to_index(enc)) {
3499 rb_enc_associate_index(str1, encidx);
3501 }
3502 }
3503 else {
3504 long pos = RSTRING_LEN(str1);
3505 int cr = ENC_CODERANGE(str1);
3506 int len;
3507 char *buf;
3508
3509 switch (len = rb_enc_codelen(code, enc)) {
3510 case ONIGERR_INVALID_CODE_POINT_VALUE:
3511 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3512 break;
3513 case ONIGERR_TOO_BIG_WIDE_CHAR_VALUE:
3514 case 0:
3515 rb_raise(rb_eRangeError, "%u out of char range", code);
3516 break;
3517 }
3518 buf = ALLOCA_N(char, len + 1);
3519 rb_enc_mbcput(code, buf, enc);
3520 if (rb_enc_precise_mbclen(buf, buf + len + 1, enc) != len) {
3521 rb_raise(rb_eRangeError, "invalid codepoint 0x%X in %s", code, rb_enc_name(enc));
3522 }
3523 rb_str_resize(str1, pos+len);
3524 memcpy(RSTRING_PTR(str1) + pos, buf, len);
3525 if (cr == ENC_CODERANGE_7BIT && code > 127)
3527 ENC_CODERANGE_SET(str1, cr);
3528 }
3529 return str1;
3530}
3531
3532int
3533rb_ascii8bit_appendable_encoding_index(rb_encoding *enc, unsigned int code)
3534{
3535 int encidx = rb_enc_to_index(enc);
3536
3537 if (encidx == ENCINDEX_ASCII_8BIT || encidx == ENCINDEX_US_ASCII) {
3538 /* US-ASCII automatically extended to ASCII-8BIT */
3539 if (code > 0xFF) {
3540 rb_raise(rb_eRangeError, "%u out of char range", code);
3541 }
3542 if (encidx == ENCINDEX_US_ASCII && code > 127) {
3543 return ENCINDEX_ASCII_8BIT;
3544 }
3545 return encidx;
3546 }
3547 else {
3548 return -1;
3549 }
3550}
3551
3552/*
3553 * call-seq:
3554 * prepend(*other_strings) -> string
3555 *
3556 * Prepends each string in +other_strings+ to +self+ and returns +self+:
3557 *
3558 * s = 'foo'
3559 * s.prepend('bar', 'baz') # => "barbazfoo"
3560 * s # => "barbazfoo"
3561 *
3562 * Related: String#concat.
3563 */
3564
3565static VALUE
3566rb_str_prepend_multi(int argc, VALUE *argv, VALUE str)
3567{
3568 str_modifiable(str);
3569
3570 if (argc == 1) {
3571 rb_str_update(str, 0L, 0L, argv[0]);
3572 }
3573 else if (argc > 1) {
3574 int i;
3575 VALUE arg_str = rb_str_tmp_new(0);
3576 rb_enc_copy(arg_str, str);
3577 for (i = 0; i < argc; i++) {
3578 rb_str_append(arg_str, argv[i]);
3579 }
3580 rb_str_update(str, 0L, 0L, arg_str);
3581 }
3582
3583 return str;
3584}
3585
3586st_index_t
3588{
3589 st_index_t h = rb_memhash((const void *)RSTRING_PTR(str), RSTRING_LEN(str));
3590 int e = RSTRING_LEN(str) ? ENCODING_GET(str) : 0;
3591 if (e && !is_ascii_string(str)) {
3592 h = rb_hash_end(rb_hash_uint32(h, (uint32_t)e));
3593 }
3594 return h;
3595}
3596
3597int
3599{
3600 long len1, len2;
3601 const char *ptr1, *ptr2;
3602 RSTRING_GETMEM(str1, ptr1, len1);
3603 RSTRING_GETMEM(str2, ptr2, len2);
3604 return (len1 != len2 ||
3605 !rb_str_comparable(str1, str2) ||
3606 memcmp(ptr1, ptr2, len1) != 0);
3607}
3608
3609/*
3610 * call-seq:
3611 * hash -> integer
3612 *
3613 * Returns the integer hash value for +self+.
3614 * The value is based on the length, content and encoding of +self+.
3615 *
3616 * Related: Object#hash.
3617 */
3618
3619static VALUE
3620rb_str_hash_m(VALUE str)
3621{
3622 st_index_t hval = rb_str_hash(str);
3623 return ST2FIX(hval);
3624}
3625
/* Yields b when a > b, otherwise a. Each argument may be evaluated twice. */
#define lesser(a, b) (((a) > (b)) ? (b) : (a))
3627
3628int
3630{
3631 int idx1, idx2;
3632 int rc1, rc2;
3633
3634 if (RSTRING_LEN(str1) == 0) return TRUE;
3635 if (RSTRING_LEN(str2) == 0) return TRUE;
3636 idx1 = ENCODING_GET(str1);
3637 idx2 = ENCODING_GET(str2);
3638 if (idx1 == idx2) return TRUE;
3639 rc1 = rb_enc_str_coderange(str1);
3640 rc2 = rb_enc_str_coderange(str2);
3641 if (rc1 == ENC_CODERANGE_7BIT) {
3642 if (rc2 == ENC_CODERANGE_7BIT) return TRUE;
3643 if (rb_enc_asciicompat(rb_enc_from_index(idx2)))
3644 return TRUE;
3645 }
3646 if (rc2 == ENC_CODERANGE_7BIT) {
3647 if (rb_enc_asciicompat(rb_enc_from_index(idx1)))
3648 return TRUE;
3649 }
3650 return FALSE;
3651}
3652
3653int
3655{
3656 long len1, len2;
3657 const char *ptr1, *ptr2;
3658 int retval;
3659
3660 if (str1 == str2) return 0;
3661 RSTRING_GETMEM(str1, ptr1, len1);
3662 RSTRING_GETMEM(str2, ptr2, len2);
3663 if (ptr1 == ptr2 || (retval = memcmp(ptr1, ptr2, lesser(len1, len2))) == 0) {
3664 if (len1 == len2) {
3665 if (!rb_str_comparable(str1, str2)) {
3666 if (ENCODING_GET(str1) > ENCODING_GET(str2))
3667 return 1;
3668 return -1;
3669 }
3670 return 0;
3671 }
3672 if (len1 > len2) return 1;
3673 return -1;
3674 }
3675 if (retval > 0) return 1;
3676 return -1;
3677}
3678
3679/*
3680 * call-seq:
3681 * string == object -> true or false
3682 * string === object -> true or false
3683 *
3684 * Returns +true+ if +object+ has the same length and content
3685 * as +self+; +false+ otherwise:
3686 *
3687 * s = 'foo'
3688 * s == 'foo' # => true
3689 * s == 'food' # => false
3690 * s == 'FOO' # => false
3691 *
3692 * Returns +false+ if the two strings' encodings are not compatible:
3693 * "\u{e4 f6 fc}".encode("ISO-8859-1") == ("\u{c4 d6 dc}") # => false
3694 *
3695 * If +object+ is not an instance of \String but responds to +to_str+, then the
3696 * two strings are compared using <code>object.==</code>.
3697 */
3698
3699VALUE
3701{
3702 if (str1 == str2) return Qtrue;
3703 if (!RB_TYPE_P(str2, T_STRING)) {
3704 if (!rb_respond_to(str2, idTo_str)) {
3705 return Qfalse;
3706 }
3707 return rb_equal(str2, str1);
3708 }
3709 return rb_str_eql_internal(str1, str2);
3710}
3711
3712/*
3713 * call-seq:
3714 * eql?(object) -> true or false
3715 *
3716 * Returns +true+ if +object+ has the same length and content
3717 * as +self+; +false+ otherwise:
3718 *
3719 * s = 'foo'
3720 * s.eql?('foo') # => true
3721 * s.eql?('food') # => false
3722 * s.eql?('FOO') # => false
3723 *
3724 * Returns +false+ if the two strings' encodings are not compatible:
3725 *
3726 * "\u{e4 f6 fc}".encode("ISO-8859-1").eql?("\u{c4 d6 dc}") # => false
3727 *
3728 */
3729
3730VALUE
3731rb_str_eql(VALUE str1, VALUE str2)
3732{
3733 if (str1 == str2) return Qtrue;
3734 if (!RB_TYPE_P(str2, T_STRING)) return Qfalse;
3735 return rb_str_eql_internal(str1, str2);
3736}
3737
3738/*
3739 * call-seq:
3740 * string <=> other_string -> -1, 0, 1, or nil
3741 *
3742 * Compares +self+ and +other_string+, returning:
3743 *
3744 * - -1 if +other_string+ is larger.
3745 * - 0 if the two are equal.
3746 * - 1 if +other_string+ is smaller.
3747 * - +nil+ if the two are incomparable.
3748 *
3749 * Examples:
3750 *
3751 * 'foo' <=> 'foo' # => 0
3752 * 'foo' <=> 'food' # => -1
3753 * 'food' <=> 'foo' # => 1
3754 * 'FOO' <=> 'foo' # => -1
3755 * 'foo' <=> 'FOO' # => 1
3756 * 'foo' <=> 1 # => nil
3757 *
3758 */
3759
3760static VALUE
3761rb_str_cmp_m(VALUE str1, VALUE str2)
3762{
3763 int result;
3764 VALUE s = rb_check_string_type(str2);
3765 if (NIL_P(s)) {
3766 return rb_invcmp(str1, str2);
3767 }
3768 result = rb_str_cmp(str1, s);
3769 return INT2FIX(result);
3770}
3771
3772static VALUE str_casecmp(VALUE str1, VALUE str2);
3773static VALUE str_casecmp_p(VALUE str1, VALUE str2);
3774
3775/*
3776 * call-seq:
3777 * casecmp(other_string) -> -1, 0, 1, or nil
3778 *
3779 * Compares <tt>self.downcase</tt> and <tt>other_string.downcase</tt>; returns:
3780 *
3781 * - -1 if <tt>other_string.downcase</tt> is larger.
3782 * - 0 if the two are equal.
3783 * - 1 if <tt>other_string.downcase</tt> is smaller.
3784 * - +nil+ if the two are incomparable.
3785 *
3786 * Examples:
3787 *
3788 * 'foo'.casecmp('foo') # => 0
3789 * 'foo'.casecmp('food') # => -1
3790 * 'food'.casecmp('foo') # => 1
3791 * 'FOO'.casecmp('foo') # => 0
3792 * 'foo'.casecmp('FOO') # => 0
3793 * 'foo'.casecmp(1) # => nil
3794 *
3795 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3796 *
3797 * Related: String#casecmp?.
3798 *
3799 */
3800
3801static VALUE
3802rb_str_casecmp(VALUE str1, VALUE str2)
3803{
3804 VALUE s = rb_check_string_type(str2);
3805 if (NIL_P(s)) {
3806 return Qnil;
3807 }
3808 return str_casecmp(str1, s);
3809}
3810
3811static VALUE
3812str_casecmp(VALUE str1, VALUE str2)
3813{
3814 long len;
3815 rb_encoding *enc;
3816 const char *p1, *p1end, *p2, *p2end;
3817
3818 enc = rb_enc_compatible(str1, str2);
3819 if (!enc) {
3820 return Qnil;
3821 }
3822
3823 p1 = RSTRING_PTR(str1); p1end = RSTRING_END(str1);
3824 p2 = RSTRING_PTR(str2); p2end = RSTRING_END(str2);
3825 if (single_byte_optimizable(str1) && single_byte_optimizable(str2)) {
3826 while (p1 < p1end && p2 < p2end) {
3827 if (*p1 != *p2) {
3828 unsigned int c1 = TOLOWER(*p1 & 0xff);
3829 unsigned int c2 = TOLOWER(*p2 & 0xff);
3830 if (c1 != c2)
3831 return INT2FIX(c1 < c2 ? -1 : 1);
3832 }
3833 p1++;
3834 p2++;
3835 }
3836 }
3837 else {
3838 while (p1 < p1end && p2 < p2end) {
3839 int l1, c1 = rb_enc_ascget(p1, p1end, &l1, enc);
3840 int l2, c2 = rb_enc_ascget(p2, p2end, &l2, enc);
3841
3842 if (0 <= c1 && 0 <= c2) {
3843 c1 = TOLOWER(c1);
3844 c2 = TOLOWER(c2);
3845 if (c1 != c2)
3846 return INT2FIX(c1 < c2 ? -1 : 1);
3847 }
3848 else {
3849 int r;
3850 l1 = rb_enc_mbclen(p1, p1end, enc);
3851 l2 = rb_enc_mbclen(p2, p2end, enc);
3852 len = l1 < l2 ? l1 : l2;
3853 r = memcmp(p1, p2, len);
3854 if (r != 0)
3855 return INT2FIX(r < 0 ? -1 : 1);
3856 if (l1 != l2)
3857 return INT2FIX(l1 < l2 ? -1 : 1);
3858 }
3859 p1 += l1;
3860 p2 += l2;
3861 }
3862 }
3863 if (RSTRING_LEN(str1) == RSTRING_LEN(str2)) return INT2FIX(0);
3864 if (RSTRING_LEN(str1) > RSTRING_LEN(str2)) return INT2FIX(1);
3865 return INT2FIX(-1);
3866}
3867
3868/*
3869 * call-seq:
3870 * casecmp?(other_string) -> true, false, or nil
3871 *
3872 * Returns +true+ if +self+ and +other_string+ are equal after
3873 * Unicode case folding, otherwise +false+:
3874 *
3875 * 'foo'.casecmp?('foo') # => true
3876 * 'foo'.casecmp?('food') # => false
3877 * 'food'.casecmp?('foo') # => false
3878 * 'FOO'.casecmp?('foo') # => true
3879 * 'foo'.casecmp?('FOO') # => true
3880 *
3881 * Returns +nil+ if the two values are incomparable:
3882 *
3883 * 'foo'.casecmp?(1) # => nil
3884 *
3885 * See {Case Mapping}[rdoc-ref:case_mapping.rdoc].
3886 *
3887 * Related: String#casecmp.
3888 *
3889 */
3890
3891static VALUE
3892rb_str_casecmp_p(VALUE str1, VALUE str2)
3893{
3894 VALUE s = rb_check_string_type(str2);
3895 if (NIL_P(s)) {
3896 return Qnil;
3897 }
3898 return str_casecmp_p(str1, s);
3899}
3900
3901static VALUE
3902str_casecmp_p(VALUE str1, VALUE str2)
3903{
3904 rb_encoding *enc;
3905 VALUE folded_str1, folded_str2;
3906 VALUE fold_opt = sym_fold;
3907
3908 enc = rb_enc_compatible(str1, str2);
3909 if (!enc) {
3910 return Qnil;
3911 }
3912
3913 folded_str1 = rb_str_downcase(1, &fold_opt, str1);
3914 folded_str2 = rb_str_downcase(1, &fold_opt, str2);
3915
3916 return rb_str_eql(folded_str1, folded_str2);
3917}
3918
3919static long
3920strseq_core(const char *str_ptr, const char *str_ptr_end, long str_len,
3921 const char *sub_ptr, long sub_len, long offset, rb_encoding *enc)
3922{
3923 const char *search_start = str_ptr;
3924 long pos, search_len = str_len - offset;
3925
3926 for (;;) {
3927 const char *t;
3928 pos = rb_memsearch(sub_ptr, sub_len, search_start, search_len, enc);
3929 if (pos < 0) return pos;
3930 t = rb_enc_right_char_head(search_start, search_start+pos, str_ptr_end, enc);
3931 if (t == search_start + pos) break;
3932 search_len -= t - search_start;
3933 if (search_len <= 0) return -1;
3934 offset += t - search_start;
3935 search_start = t;
3936 }
3937 return pos + offset;
3938}
3939
/*
 * Both macros yield the index of the match in bytes (negative when there
 * is no match).  They differ only in the last argument passed to
 * rb_strseq_index(), its in_byte flag: rb_str_index passes 0 and
 * rb_str_byteindex passes 1, so the latter's +offset+ is used as a byte
 * offset without conversion.
 */
#define rb_str_index(str, sub, offset) rb_strseq_index(str, sub, offset, 0)
#define rb_str_byteindex(str, sub, offset) rb_strseq_index(str, sub, offset, 1)
3943
3944static long
3945rb_strseq_index(VALUE str, VALUE sub, long offset, int in_byte)
3946{
3947 const char *str_ptr, *str_ptr_end, *sub_ptr;
3948 long str_len, sub_len;
3949 rb_encoding *enc;
3950
3951 enc = rb_enc_check(str, sub);
3952 if (is_broken_string(sub)) return -1;
3953
3954 str_ptr = RSTRING_PTR(str);
3955 str_ptr_end = RSTRING_END(str);
3956 str_len = RSTRING_LEN(str);
3957 sub_ptr = RSTRING_PTR(sub);
3958 sub_len = RSTRING_LEN(sub);
3959
3960 if (str_len < sub_len) return -1;
3961
3962 if (offset != 0) {
3963 long str_len_char, sub_len_char;
3964 int single_byte = single_byte_optimizable(str);
3965 str_len_char = (in_byte || single_byte) ? str_len : str_strlen(str, enc);
3966 sub_len_char = in_byte ? sub_len : str_strlen(sub, enc);
3967 if (offset < 0) {
3968 offset += str_len_char;
3969 if (offset < 0) return -1;
3970 }
3971 if (str_len_char - offset < sub_len_char) return -1;
3972 if (!in_byte) offset = str_offset(str_ptr, str_ptr_end, offset, enc, single_byte);
3973 str_ptr += offset;
3974 }
3975 if (sub_len == 0) return offset;
3976
3977 /* need proceed one character at a time */
3978 return strseq_core(str_ptr, str_ptr_end, str_len, sub_ptr, sub_len, offset, enc);
3979}
3980
3981
3982/*
3983 * call-seq:
3984 * index(substring, offset = 0) -> integer or nil
3985 * index(regexp, offset = 0) -> integer or nil
3986 *
3987 * :include: doc/string/index.rdoc
3988 *
3989 */
3990
3991static VALUE
3992rb_str_index_m(int argc, VALUE *argv, VALUE str)
3993{
3994 VALUE sub;
3995 VALUE initpos;
3996 rb_encoding *enc = STR_ENC_GET(str);
3997 long pos;
3998
3999 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4000 long slen = str_strlen(str, enc); /* str's enc */
4001 pos = NUM2LONG(initpos);
4002 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4003 if (RB_TYPE_P(sub, T_REGEXP)) {
4005 }
4006 return Qnil;
4007 }
4008 }
4009 else {
4010 pos = 0;
4011 }
4012
4013 if (RB_TYPE_P(sub, T_REGEXP)) {
4014 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4015 enc, single_byte_optimizable(str));
4016
4017 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4018 VALUE match = rb_backref_get();
4019 struct re_registers *regs = RMATCH_REGS(match);
4020 pos = rb_str_sublen(str, BEG(0));
4021 return LONG2NUM(pos);
4022 }
4023 }
4024 else {
4025 StringValue(sub);
4026 pos = rb_str_index(str, sub, pos);
4027 if (pos >= 0) {
4028 pos = rb_str_sublen(str, pos);
4029 return LONG2NUM(pos);
4030 }
4031 }
4032 return Qnil;
4033}
4034
4035/* Ensure that the given pos is a valid character boundary.
4036 * Note that in this function, "character" means a code point
4037 * (Unicode scalar value), not a grapheme cluster.
4038 */
4039static void
4040str_ensure_byte_pos(VALUE str, long pos)
4041{
4042 const char *s = RSTRING_PTR(str);
4043 const char *e = RSTRING_END(str);
4044 const char *p = s + pos;
4045 if (!at_char_boundary(s, p, e, rb_enc_get(str))) {
4046 rb_raise(rb_eIndexError,
4047 "offset %ld does not land on character boundary", pos);
4048 }
4049}
4050
4051/*
4052 * call-seq:
4053 * byteindex(substring, offset = 0) -> integer or nil
4054 * byteindex(regexp, offset = 0) -> integer or nil
4055 *
4056 * Returns the Integer byte-based index of the first occurrence of the given +substring+,
4057 * or +nil+ if none found:
4058 *
4059 * 'foo'.byteindex('f') # => 0
4060 * 'foo'.byteindex('o') # => 1
4061 * 'foo'.byteindex('oo') # => 1
4062 * 'foo'.byteindex('ooo') # => nil
4063 *
4064 * Returns the Integer byte-based index of the first match for the given Regexp +regexp+,
4065 * or +nil+ if none found:
4066 *
4067 * 'foo'.byteindex(/f/) # => 0
4068 * 'foo'.byteindex(/o/) # => 1
4069 * 'foo'.byteindex(/oo/) # => 1
4070 * 'foo'.byteindex(/ooo/) # => nil
4071 *
4072 * Integer argument +offset+, if given, specifies the byte-based position in the
4073 * string to begin the search:
4074 *
4075 * 'foo'.byteindex('o', 1) # => 1
4076 * 'foo'.byteindex('o', 2) # => 2
4077 * 'foo'.byteindex('o', 3) # => nil
4078 *
4079 * If +offset+ is negative, counts backward from the end of +self+:
4080 *
4081 * 'foo'.byteindex('o', -1) # => 2
4082 * 'foo'.byteindex('o', -2) # => 1
4083 * 'foo'.byteindex('o', -3) # => 1
4084 * 'foo'.byteindex('o', -4) # => nil
4085 *
4086 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4087 * raised.
4088 *
4089 * Related: String#index, String#byterindex.
4090 */
4091
4092static VALUE
4093rb_str_byteindex_m(int argc, VALUE *argv, VALUE str)
4094{
4095 VALUE sub;
4096 VALUE initpos;
4097 long pos;
4098
4099 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4100 long slen = RSTRING_LEN(str);
4101 pos = NUM2LONG(initpos);
4102 if (pos < 0 ? (pos += slen) < 0 : pos > slen) {
4103 if (RB_TYPE_P(sub, T_REGEXP)) {
4105 }
4106 return Qnil;
4107 }
4108 }
4109 else {
4110 pos = 0;
4111 }
4112
4113 str_ensure_byte_pos(str, pos);
4114
4115 if (RB_TYPE_P(sub, T_REGEXP)) {
4116 if (rb_reg_search(sub, str, pos, 0) >= 0) {
4117 VALUE match = rb_backref_get();
4118 struct re_registers *regs = RMATCH_REGS(match);
4119 pos = BEG(0);
4120 return LONG2NUM(pos);
4121 }
4122 }
4123 else {
4124 StringValue(sub);
4125 pos = rb_str_byteindex(str, sub, pos);
4126 if (pos >= 0) return LONG2NUM(pos);
4127 }
4128 return Qnil;
4129}
4130
/*
 * str_rindex: backward search for +sub+ in +str+, starting at byte
 * pointer +s+ and moving towards the start of the string.  Returns the
 * byte offset of the match from the start of +str+, or -1.
 *
 * NOTE(review): assumes +s+ points into str's buffer; the callers in this
 * file pass str_nth(...) or RSTRING_PTR(str) + pos.
 *
 * Two implementations follow; the first is compiled when memrchr() is
 * available (HAVE_MEMRCHR).
 */
#ifdef HAVE_MEMRCHR
static long
str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
{
    char *hit, *adjusted;
    int c;
    long slen, searchlen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    slen = RSTRING_LEN(sub);
    /* an empty needle matches right at the starting position */
    if (slen == 0) return s - sbeg;
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    /* first byte of the needle, used as the memrchr() probe */
    c = *t & 0xff;
    /* the search window is [sbeg, s], the byte at s included */
    searchlen = s - sbeg + 1;

    do {
        hit = memrchr(sbeg, c, searchlen);
        if (!hit) break;
        adjusted = rb_enc_left_char_head(sbeg, hit, e, enc);
        if (hit != adjusted) {
            /* the probe byte was found inside a character: shrink the
             * window to end before that character and try again */
            searchlen = adjusted - sbeg;
            continue;
        }
        if (memcmp(hit, t, slen) == 0)
            return hit - sbeg;
        /* first byte matched but the rest did not: keep looking left */
        searchlen = adjusted - sbeg;
    } while (searchlen > 0);

    return -1;
}
#else
static long
str_rindex(VALUE str, VALUE sub, const char *s, rb_encoding *enc)
{
    long slen;
    char *sbeg, *e, *t;

    sbeg = RSTRING_PTR(str);
    e = RSTRING_END(str);
    t = RSTRING_PTR(sub);
    slen = RSTRING_LEN(sub);

    /* compare at s, then step back one character at a time */
    while (s) {
        if (memcmp(s, t, slen) == 0) {
            return s - sbeg;
        }
        /* reached the start of the string without a match */
        if (s <= sbeg) break;
        s = rb_enc_prev_char(sbeg, s, e, enc);
    }

    return -1;
}
#endif
4186
4187/* found index in byte */
4188static long
4189rb_str_rindex(VALUE str, VALUE sub, long pos)
4190{
4191 long len, slen;
4192 char *sbeg, *s;
4193 rb_encoding *enc;
4194 int singlebyte;
4195
4196 enc = rb_enc_check(str, sub);
4197 if (is_broken_string(sub)) return -1;
4198 singlebyte = single_byte_optimizable(str);
4199 len = singlebyte ? RSTRING_LEN(str) : str_strlen(str, enc); /* rb_enc_check */
4200 slen = str_strlen(sub, enc); /* rb_enc_check */
4201
4202 /* substring longer than string */
4203 if (len < slen) return -1;
4204 if (len - pos < slen) pos = len - slen;
4205 if (len == 0) return pos;
4206
4207 sbeg = RSTRING_PTR(str);
4208
4209 if (pos == 0) {
4210 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4211 return 0;
4212 else
4213 return -1;
4214 }
4215
4216 s = str_nth(sbeg, RSTRING_END(str), pos, enc, singlebyte);
4217 return str_rindex(str, sub, s, enc);
4218}
4219
4220/*
4221 * call-seq:
4222 * rindex(substring, offset = self.length) -> integer or nil
4223 * rindex(regexp, offset = self.length) -> integer or nil
4224 *
4225 * Returns the Integer index of the _last_ occurrence of the given +substring+,
4226 * or +nil+ if none found:
4227 *
4228 * 'foo'.rindex('f') # => 0
4229 * 'foo'.rindex('o') # => 2
4230 * 'foo'.rindex('oo') # => 1
4231 * 'foo'.rindex('ooo') # => nil
4232 *
4233 * Returns the Integer index of the _last_ match for the given Regexp +regexp+,
4234 * or +nil+ if none found:
4235 *
4236 * 'foo'.rindex(/f/) # => 0
4237 * 'foo'.rindex(/o/) # => 2
4238 * 'foo'.rindex(/oo/) # => 1
4239 * 'foo'.rindex(/ooo/) # => nil
4240 *
4241 * The _last_ match means starting at the possible last position, not
4242 * the last of longest matches.
4243 *
4244 * 'foo'.rindex(/o+/) # => 2
4245 * $~ #=> #<MatchData "o">
4246 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
4249 *
4250 * 'foo'.rindex(/(?<!o)o+/) # => 1
4251 * $~ #=> #<MatchData "oo">
4252 *
 * Or use String#index with a negative lookahead:
4254 *
4255 * 'foo'.index(/o+(?!.*o)/) # => 1
4256 * $~ #=> #<MatchData "oo">
4257 *
4258 * Integer argument +offset+, if given and non-negative, specifies the maximum starting position in the
4259 * string to _end_ the search:
4260 *
4261 * 'foo'.rindex('o', 0) # => nil
4262 * 'foo'.rindex('o', 1) # => 1
4263 * 'foo'.rindex('o', 2) # => 2
4264 * 'foo'.rindex('o', 3) # => 2
4265 *
4266 * If +offset+ is a negative Integer, the maximum starting position in the
4267 * string to _end_ the search is the sum of the string's length and +offset+:
4268 *
4269 * 'foo'.rindex('o', -1) # => 2
4270 * 'foo'.rindex('o', -2) # => 1
4271 * 'foo'.rindex('o', -3) # => nil
4272 * 'foo'.rindex('o', -4) # => nil
4273 *
4274 * Related: String#index.
4275 */
4276
4277static VALUE
4278rb_str_rindex_m(int argc, VALUE *argv, VALUE str)
4279{
4280 VALUE sub;
4281 VALUE initpos;
4282 rb_encoding *enc = STR_ENC_GET(str);
4283 long pos, len = str_strlen(str, enc); /* str's enc */
4284
4285 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4286 pos = NUM2LONG(initpos);
4287 if (pos < 0 && (pos += len) < 0) {
4288 if (RB_TYPE_P(sub, T_REGEXP)) {
4290 }
4291 return Qnil;
4292 }
4293 if (pos > len) pos = len;
4294 }
4295 else {
4296 pos = len;
4297 }
4298
4299 if (RB_TYPE_P(sub, T_REGEXP)) {
4300 /* enc = rb_enc_check(str, sub); */
4301 pos = str_offset(RSTRING_PTR(str), RSTRING_END(str), pos,
4302 enc, single_byte_optimizable(str));
4303
4304 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4305 VALUE match = rb_backref_get();
4306 struct re_registers *regs = RMATCH_REGS(match);
4307 pos = rb_str_sublen(str, BEG(0));
4308 return LONG2NUM(pos);
4309 }
4310 }
4311 else {
4312 StringValue(sub);
4313 pos = rb_str_rindex(str, sub, pos);
4314 if (pos >= 0) {
4315 pos = rb_str_sublen(str, pos);
4316 return LONG2NUM(pos);
4317 }
4318 }
4319 return Qnil;
4320}
4321
4322static long
4323rb_str_byterindex(VALUE str, VALUE sub, long pos)
4324{
4325 long len, slen;
4326 char *sbeg, *s;
4327 rb_encoding *enc;
4328
4329 enc = rb_enc_check(str, sub);
4330 if (is_broken_string(sub)) return -1;
4331 len = RSTRING_LEN(str);
4332 slen = RSTRING_LEN(sub);
4333
4334 /* substring longer than string */
4335 if (len < slen) return -1;
4336 if (len - pos < slen) pos = len - slen;
4337 if (len == 0) return pos;
4338
4339 sbeg = RSTRING_PTR(str);
4340
4341 if (pos == 0) {
4342 if (memcmp(sbeg, RSTRING_PTR(sub), RSTRING_LEN(sub)) == 0)
4343 return 0;
4344 else
4345 return -1;
4346 }
4347
4348 s = sbeg + pos;
4349 return str_rindex(str, sub, s, enc);
4350}
4351
4352
4353/*
4354 * call-seq:
4355 * byterindex(substring, offset = self.bytesize) -> integer or nil
4356 * byterindex(regexp, offset = self.bytesize) -> integer or nil
4357 *
4358 * Returns the Integer byte-based index of the _last_ occurrence of the given +substring+,
4359 * or +nil+ if none found:
4360 *
4361 * 'foo'.byterindex('f') # => 0
4362 * 'foo'.byterindex('o') # => 2
4363 * 'foo'.byterindex('oo') # => 1
4364 * 'foo'.byterindex('ooo') # => nil
4365 *
4366 * Returns the Integer byte-based index of the _last_ match for the given Regexp +regexp+,
4367 * or +nil+ if none found:
4368 *
4369 * 'foo'.byterindex(/f/) # => 0
4370 * 'foo'.byterindex(/o/) # => 2
4371 * 'foo'.byterindex(/oo/) # => 1
4372 * 'foo'.byterindex(/ooo/) # => nil
4373 *
4374 * The _last_ match means starting at the possible last position, not
4375 * the last of longest matches.
4376 *
4377 * 'foo'.byterindex(/o+/) # => 2
4378 * $~ #=> #<MatchData "o">
4379 *
 * To get the last longest match, combine the pattern with a negative
 * lookbehind:
4382 *
4383 * 'foo'.byterindex(/(?<!o)o+/) # => 1
4384 * $~ #=> #<MatchData "oo">
4385 *
 * Or use String#byteindex with a negative lookahead:
4387 *
4388 * 'foo'.byteindex(/o+(?!.*o)/) # => 1
4389 * $~ #=> #<MatchData "oo">
4390 *
4391 * Integer argument +offset+, if given and non-negative, specifies the maximum starting byte-based position in the
4392 * string to _end_ the search:
4393 *
4394 * 'foo'.byterindex('o', 0) # => nil
4395 * 'foo'.byterindex('o', 1) # => 1
4396 * 'foo'.byterindex('o', 2) # => 2
4397 * 'foo'.byterindex('o', 3) # => 2
4398 *
 * If +offset+ is a negative Integer, the maximum starting byte-based position in the
 * string to _end_ the search is the sum of the string's byte length and +offset+:
4401 *
4402 * 'foo'.byterindex('o', -1) # => 2
4403 * 'foo'.byterindex('o', -2) # => 1
4404 * 'foo'.byterindex('o', -3) # => nil
4405 * 'foo'.byterindex('o', -4) # => nil
4406 *
4407 * If +offset+ does not land on character (codepoint) boundary, +IndexError+ is
4408 * raised.
4409 *
4410 * Related: String#byteindex.
4411 */
4412
4413static VALUE
4414rb_str_byterindex_m(int argc, VALUE *argv, VALUE str)
4415{
4416 VALUE sub;
4417 VALUE initpos;
4418 long pos, len = RSTRING_LEN(str);
4419
4420 if (rb_scan_args(argc, argv, "11", &sub, &initpos) == 2) {
4421 pos = NUM2LONG(initpos);
4422 if (pos < 0 && (pos += len) < 0) {
4423 if (RB_TYPE_P(sub, T_REGEXP)) {
4425 }
4426 return Qnil;
4427 }
4428 if (pos > len) pos = len;
4429 }
4430 else {
4431 pos = len;
4432 }
4433
4434 str_ensure_byte_pos(str, pos);
4435
4436 if (RB_TYPE_P(sub, T_REGEXP)) {
4437 if (rb_reg_search(sub, str, pos, 1) >= 0) {
4438 VALUE match = rb_backref_get();
4439 struct re_registers *regs = RMATCH_REGS(match);
4440 pos = BEG(0);
4441 return LONG2NUM(pos);
4442 }
4443 }
4444 else {
4445 StringValue(sub);
4446 pos = rb_str_byterindex(str, sub, pos);
4447 if (pos >= 0) return LONG2NUM(pos);
4448 }
4449 return Qnil;
4450}
4451
4452/*
4453 * call-seq:
4454 * string =~ regexp -> integer or nil
4455 * string =~ object -> integer or nil
4456 *
4457 * Returns the Integer index of the first substring that matches
4458 * the given +regexp+, or +nil+ if no match found:
4459 *
4460 * 'foo' =~ /f/ # => 0
4461 * 'foo' =~ /o/ # => 1
4462 * 'foo' =~ /x/ # => nil
4463 *
4464 * Note: also updates Regexp@Global+Variables.
4465 *
4466 * If the given +object+ is not a Regexp, returns the value
4467 * returned by <tt>object =~ self</tt>.
4468 *
4469 * Note that <tt>string =~ regexp</tt> is different from <tt>regexp =~ string</tt>
4470 * (see Regexp#=~):
4471 *
4472 * number= nil
4473 * "no. 9" =~ /(?<number>\d+)/
4474 * number # => nil (not assigned)
4475 * /(?<number>\d+)/ =~ "no. 9"
4476 * number #=> "9"
4477 *
4478 */
4479
4480static VALUE
4481rb_str_match(VALUE x, VALUE y)
4482{
4483 switch (OBJ_BUILTIN_TYPE(y)) {
4484 case T_STRING:
4485 rb_raise(rb_eTypeError, "type mismatch: String given");
4486
4487 case T_REGEXP:
4488 return rb_reg_match(y, x);
4489
4490 default:
4491 return rb_funcall(y, idEqTilde, 1, x);
4492 }
4493}
4494
4495
/* Forward declaration of get_pat, which rb_str_match_m and
 * rb_str_match_m_p below apply to their pattern argument. */
static VALUE get_pat(VALUE);
4497
4498
4499/*
4500 * call-seq:
4501 * match(pattern, offset = 0) -> matchdata or nil
4502 * match(pattern, offset = 0) {|matchdata| ... } -> object
4503 *
4504 * Returns a MatchData object (or +nil+) based on +self+ and the given +pattern+.
4505 *
4506 * Note: also updates Regexp@Global+Variables.
4507 *
4508 * - Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4509 * regexp = Regexp.new(pattern)
4510 * - Computes +matchdata+, which will be either a MatchData object or +nil+
4511 * (see Regexp#match):
 *       matchdata = regexp.match(self)
4513 *
4514 * With no block given, returns the computed +matchdata+:
4515 *
4516 * 'foo'.match('f') # => #<MatchData "f">
4517 * 'foo'.match('o') # => #<MatchData "o">
4518 * 'foo'.match('x') # => nil
4519 *
4520 * If Integer argument +offset+ is given, the search begins at index +offset+:
4521 *
4522 * 'foo'.match('f', 1) # => nil
4523 * 'foo'.match('o', 1) # => #<MatchData "o">
4524 *
4525 * With a block given, calls the block with the computed +matchdata+
4526 * and returns the block's return value:
4527 *
4528 * 'foo'.match(/o/) {|matchdata| matchdata } # => #<MatchData "o">
4529 * 'foo'.match(/x/) {|matchdata| matchdata } # => nil
4530 * 'foo'.match(/f/, 1) {|matchdata| matchdata } # => nil
4531 *
4532 */
4533
4534static VALUE
4535rb_str_match_m(int argc, VALUE *argv, VALUE str)
4536{
4537 VALUE re, result;
4538 if (argc < 1)
4539 rb_check_arity(argc, 1, 2);
4540 re = argv[0];
4541 argv[0] = str;
4542 result = rb_funcallv(get_pat(re), rb_intern("match"), argc, argv);
4543 if (!NIL_P(result) && rb_block_given_p()) {
4544 return rb_yield(result);
4545 }
4546 return result;
4547}
4548
4549/*
4550 * call-seq:
4551 * match?(pattern, offset = 0) -> true or false
4552 *
4553 * Returns +true+ or +false+ based on whether a match is found for +self+ and +pattern+.
4554 *
4555 * Note: does not update Regexp@Global+Variables.
4556 *
4557 * Computes +regexp+ by converting +pattern+ (if not already a Regexp).
4558 * regexp = Regexp.new(pattern)
4559 *
 * Returns +true+ if <tt>self.match(regexp)</tt> returns a MatchData object,
4561 * +false+ otherwise:
4562 *
4563 * 'foo'.match?(/o/) # => true
4564 * 'foo'.match?('o') # => true
4565 * 'foo'.match?(/x/) # => false
4566 *
4567 * If Integer argument +offset+ is given, the search begins at index +offset+:
4568 * 'foo'.match?('f', 1) # => false
4569 * 'foo'.match?('o', 1) # => true
4570 *
4571 */
4572
4573static VALUE
4574rb_str_match_m_p(int argc, VALUE *argv, VALUE str)
4575{
4576 VALUE re;
4577 rb_check_arity(argc, 1, 2);
4578 re = get_pat(argv[0]);
4579 return rb_reg_match_p(re, str, argc > 1 ? NUM2LONG(argv[1]) : 0);
4580}
4581
/* Result of stepping a character to its successor or predecessor
 * (see enc_succ_char, enc_pred_char and enc_succ_alnum_char). */
enum neighbor_char {
    /* the input, or the stepped value, is not a usable character */
    NEIGHBOR_NOT_CHAR,
    /* a neighbouring character of the same byte length was produced */
    NEIGHBOR_FOUND,
    /* stepping ran off the end of the range for this byte length */
    NEIGHBOR_WRAPPED
};
4587
/*
 * Steps the +len+-byte character at +p+ forward, in place, to the next
 * character of +enc+ that has the same byte length.
 *
 * Returns NEIGHBOR_FOUND when such a character was written,
 * NEIGHBOR_WRAPPED when no further +len+-byte value exists, and
 * NEIGHBOR_NOT_CHAR (only on the wide-character path) when the input or
 * the incremented code point is not a valid character.
 */
static enum neighbor_char
enc_succ_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;

    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* work on the code point: decode, add one, encode back */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc) + 1;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the successor needs a different number of bytes; p is untouched */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* re-validate what was just written */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* otherwise treat the bytes as a big-endian counter and keep
     * incrementing until the result is a valid character of length len */
    while (1) {
        /* carry: trailing 0xff bytes roll over to 0x00 */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0xff; i--)
            p[i] = '\0';
        /* every byte was 0xff: nothing left to increment */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        ++((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognised: saturate the bytes
                 * after it so the next pass carries into that prefix */
                memset(p+l, 0xff, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* invalid sequence and the byte just bumped is not the last
             * one: find the longest proper prefix (len2 bytes) that is
             * not itself invalid and saturate the bytes after index len2;
             * presumably this skips tail values that cannot become valid */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0xff, len-(len2+1));
        }
    }
}
4639
/*
 * Steps the +len+-byte character at +p+ backward, in place, to the
 * previous character of +enc+ that has the same byte length.  This is
 * the mirror image of enc_succ_char.
 *
 * Returns NEIGHBOR_FOUND when such a character was written,
 * NEIGHBOR_WRAPPED when no smaller +len+-byte value exists, and
 * NEIGHBOR_NOT_CHAR (only on the wide-character path) when the input is
 * invalid, its code point is 0, or the decremented code point is not a
 * valid character.
 */
static enum neighbor_char
enc_pred_char(char *p, long len, rb_encoding *enc)
{
    long i;
    int l;
    if (rb_enc_mbminlen(enc) > 1) {
        /* wchar, trivial case */
        /* work on the code point: decode, subtract one, encode back */
        int r = rb_enc_precise_mbclen(p, p + len, enc), c;
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        c = rb_enc_mbc_to_codepoint(p, p + len, enc);
        /* code point 0 has no predecessor */
        if (!c) return NEIGHBOR_NOT_CHAR;
        --c;
        l = rb_enc_code_to_mbclen(c, enc);
        if (!l) return NEIGHBOR_NOT_CHAR;
        /* the predecessor needs a different number of bytes; p is untouched */
        if (l != len) return NEIGHBOR_WRAPPED;
        rb_enc_mbcput(c, p, enc);
        /* re-validate what was just written */
        r = rb_enc_precise_mbclen(p, p + len, enc);
        if (!MBCLEN_CHARFOUND_P(r)) {
            return NEIGHBOR_NOT_CHAR;
        }
        return NEIGHBOR_FOUND;
    }
    /* otherwise treat the bytes as a big-endian counter and keep
     * decrementing until the result is a valid character of length len */
    while (1) {
        /* borrow: trailing 0x00 bytes roll under to 0xff */
        for (i = len-1; 0 <= i && (unsigned char)p[i] == 0; i--)
            p[i] = '\xff';
        /* every byte was 0x00: nothing left to decrement */
        if (i < 0)
            return NEIGHBOR_WRAPPED;
        --((unsigned char*)p)[i];
        l = rb_enc_precise_mbclen(p, p+len, enc);
        if (MBCLEN_CHARFOUND_P(l)) {
            l = MBCLEN_CHARFOUND_LEN(l);
            if (l == len) {
                return NEIGHBOR_FOUND;
            }
            else {
                /* a shorter character was recognised: zero the bytes
                 * after it so the next pass borrows from that prefix */
                memset(p+l, 0, len-l);
            }
        }
        if (MBCLEN_INVALID_P(l) && i < len-1) {
            /* invalid sequence and the byte just decremented is not the
             * last one: find the longest proper prefix (len2 bytes) that
             * is not itself invalid and zero the bytes after index len2;
             * presumably this skips tail values that cannot become valid */
            long len2;
            int l2;
            for (len2 = len-1; 0 < len2; len2--) {
                l2 = rb_enc_precise_mbclen(p, p+len2, enc);
                if (!MBCLEN_INVALID_P(l2))
                    break;
            }
            memset(p+len2+1, 0, len-(len2+1));
        }
    }
}
4692
/*
  Overwrites the +len+-byte character at +p+ with the succeeding letter
  or digit in +enc+ and returns NEIGHBOR_FOUND, or, when the character is
  the last one of its range, rewinds +p+ to the first character of that
  range and returns NEIGHBOR_WRAPPED.
  On NEIGHBOR_WRAPPED the carried-out character is stored into +carry+:
  the first character of the range for letters, and the successor of the
  first character of the range for digits.
  This assumes that each range is contiguous (up to max_gaps invalid
  code points are skipped) and that mbclen never changes within a range.
  NEIGHBOR_NOT_CHAR is returned if the character is neither a digit nor
  a letter, or if its range has only one character.
 */
static enum neighbor_char
enc_succ_alnum_char(char *p, long len, rb_encoding *enc, char *carry)
{
    enum neighbor_char ret;
    unsigned int c;
    int ctype;
    int range;
    char save[ONIGENC_CODE_TO_MBC_MAXLEN];

    /* skip 03A2, invalid char between GREEK CAPITAL LETTERS */
    int try;
    const int max_gaps = 1;

    /* classify the character; anything but a digit or a letter is rejected */
    c = rb_enc_mbc_to_codepoint(p, p+len, enc);
    if (rb_enc_isctype(c, ONIGENC_CTYPE_DIGIT, enc))
        ctype = ONIGENC_CTYPE_DIGIT;
    else if (rb_enc_isctype(c, ONIGENC_CTYPE_ALPHA, enc))
        ctype = ONIGENC_CTYPE_ALPHA;
    else
        return NEIGHBOR_NOT_CHAR;

    /* try the next character(s); succeed if one is of the same class */
    MEMCPY(save, p, char, len);
    for (try = 0; try <= max_gaps; ++try) {
        ret = enc_succ_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (rb_enc_isctype(c, ctype, enc))
                return NEIGHBOR_FOUND;
        }
    }
    /* no successor in the same class: restore p and walk backwards to
     * the first character of the range, counting its size in +range+ */
    MEMCPY(p, save, char, len);
    range = 1;
    while (1) {
        MEMCPY(save, p, char, len);
        ret = enc_pred_char(p, len, enc);
        if (ret == NEIGHBOR_FOUND) {
            c = rb_enc_mbc_to_codepoint(p, p+len, enc);
            if (!rb_enc_isctype(c, ctype, enc)) {
                /* stepped out of the class: undo the last step */
                MEMCPY(p, save, char, len);
                break;
            }
        }
        else {
            /* no predecessor at all: undo and stop */
            MEMCPY(p, save, char, len);
            break;
        }
        range++;
    }
    /* a range of a single character cannot wrap around */
    if (range == 1) {
        return NEIGHBOR_NOT_CHAR;
    }

    /* letters: the carry is the first character of the range */
    if (ctype != ONIGENC_CTYPE_DIGIT) {
        MEMCPY(carry, p, char, len);
        return NEIGHBOR_WRAPPED;
    }

    /* digits: the carry is the successor of the first character of the range */
    MEMCPY(carry, p, char, len);
    enc_succ_char(carry, len, enc);
    return NEIGHBOR_WRAPPED;
}
4763
4764
/* Forward declaration of the in-place successor routine that does the
 * work for String#succ and String#succ!. */
static VALUE str_succ(VALUE str);
4766
4767/*
4768 * call-seq:
4769 * succ -> new_str
4770 *
4771 * Returns the successor to +self+. The successor is calculated by
4772 * incrementing characters.
4773 *
 * The first character to be incremented is the rightmost alphanumeric,
 * or, if there are no alphanumerics, the rightmost character:
4776 *
4777 * 'THX1138'.succ # => "THX1139"
4778 * '<<koala>>'.succ # => "<<koalb>>"
4779 * '***'.succ # => '**+'
4780 *
4781 * The successor to a digit is another digit, "carrying" to the next-left
4782 * character for a "rollover" from 9 to 0, and prepending another digit
4783 * if necessary:
4784 *
4785 * '00'.succ # => "01"
4786 * '09'.succ # => "10"
4787 * '99'.succ # => "100"
4788 *
4789 * The successor to a letter is another letter of the same case,
4790 * carrying to the next-left character for a rollover,
4791 * and prepending another same-case letter if necessary:
4792 *
4793 * 'aa'.succ # => "ab"
4794 * 'az'.succ # => "ba"
4795 * 'zz'.succ # => "aaa"
4796 * 'AA'.succ # => "AB"
4797 * 'AZ'.succ # => "BA"
4798 * 'ZZ'.succ # => "AAA"
4799 *
4800 * The successor to a non-alphanumeric character is the next character
4801 * in the underlying character set's collating sequence,
4802 * carrying to the next-left character for a rollover,
4803 * and prepending another character if necessary:
4804 *
4805 * s = 0.chr * 3
4806 * s # => "\x00\x00\x00"
4807 * s.succ # => "\x00\x00\x01"
4808 * s = 255.chr * 3
4809 * s # => "\xFF\xFF\xFF"
4810 * s.succ # => "\x01\x00\x00\x00"
4811 *
4812 * Carrying can occur between and among mixtures of alphanumeric characters:
4813 *
4814 * s = 'zz99zz99'
4815 * s.succ # => "aaa00aa00"
4816 * s = '99zz99zz'
4817 * s.succ # => "100aa00aa"
4818 *
4819 * The successor to an empty \String is a new empty \String:
4820 *
4821 * ''.succ # => ""
4822 *
4823 */
4824
4825VALUE
4827{
4828 VALUE str;
4829 str = rb_str_new(RSTRING_PTR(orig), RSTRING_LEN(orig));
4830 rb_enc_cr_str_copy_for_substr(str, orig);
4831 return str_succ(str);
4832}
4833
4834static VALUE
4835str_succ(VALUE str)
4836{
4837 rb_encoding *enc;
4838 char *sbeg, *s, *e, *last_alnum = 0;
4839 int found_alnum = 0;
4840 long l, slen;
4841 char carry[ONIGENC_CODE_TO_MBC_MAXLEN] = "\1";
4842 long carry_pos = 0, carry_len = 1;
4843 enum neighbor_char neighbor = NEIGHBOR_FOUND;
4844
4845 slen = RSTRING_LEN(str);
4846 if (slen == 0) return str;
4847
4848 enc = STR_ENC_GET(str);
4849 sbeg = RSTRING_PTR(str);
4850 s = e = sbeg + slen;
4851
4852 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4853 if (neighbor == NEIGHBOR_NOT_CHAR && last_alnum) {
4854 if (ISALPHA(*last_alnum) ? ISDIGIT(*s) :
4855 ISDIGIT(*last_alnum) ? ISALPHA(*s) : 0) {
4856 break;
4857 }
4858 }
4859 l = rb_enc_precise_mbclen(s, e, enc);
4860 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4861 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4862 neighbor = enc_succ_alnum_char(s, l, enc, carry);
4863 switch (neighbor) {
4864 case NEIGHBOR_NOT_CHAR:
4865 continue;
4866 case NEIGHBOR_FOUND:
4867 return str;
4868 case NEIGHBOR_WRAPPED:
4869 last_alnum = s;
4870 break;
4871 }
4872 found_alnum = 1;
4873 carry_pos = s - sbeg;
4874 carry_len = l;
4875 }
4876 if (!found_alnum) { /* str contains no alnum */
4877 s = e;
4878 while ((s = rb_enc_prev_char(sbeg, s, e, enc)) != 0) {
4879 enum neighbor_char neighbor;
4880 char tmp[ONIGENC_CODE_TO_MBC_MAXLEN];
4881 l = rb_enc_precise_mbclen(s, e, enc);
4882 if (!ONIGENC_MBCLEN_CHARFOUND_P(l)) continue;
4883 l = ONIGENC_MBCLEN_CHARFOUND_LEN(l);
4884 MEMCPY(tmp, s, char, l);
4885 neighbor = enc_succ_char(tmp, l, enc);
4886 switch (neighbor) {
4887 case NEIGHBOR_FOUND:
4888 MEMCPY(s, tmp, char, l);
4889 return str;
4890 break;
4891 case NEIGHBOR_WRAPPED:
4892 MEMCPY(s, tmp, char, l);
4893 break;
4894 case NEIGHBOR_NOT_CHAR:
4895 break;
4896 }
4897 if (rb_enc_precise_mbclen(s, s+l, enc) != l) {
4898 /* wrapped to \0...\0. search next valid char. */
4899 enc_succ_char(s, l, enc);
4900 }
4901 if (!rb_enc_asciicompat(enc)) {
4902 MEMCPY(carry, s, char, l);
4903 carry_len = l;
4904 }
4905 carry_pos = s - sbeg;
4906 }
4908 }
4909 RESIZE_CAPA(str, slen + carry_len);
4910 sbeg = RSTRING_PTR(str);
4911 s = sbeg + carry_pos;
4912 memmove(s + carry_len, s, slen - carry_pos);
4913 memmove(s, carry, carry_len);
4914 slen += carry_len;
4915 STR_SET_LEN(str, slen);
4916 TERM_FILL(&sbeg[slen], rb_enc_mbminlen(enc));
4918 return str;
4919}
4920
4921
4922/*
4923 * call-seq:
4924 * succ! -> self
4925 *
4926 * Equivalent to String#succ, but modifies +self+ in place; returns +self+.
4927 */
4928
4929static VALUE
4930rb_str_succ_bang(VALUE str)
4931{
4932 rb_str_modify(str);
4933 str_succ(str);
4934 return str;
4935}
4936
/*
 * Returns 1 when each of the first +len+ bytes at +s+ satisfies ISDIGIT
 * (and therefore also when +len+ is not positive), otherwise 0.
 */
static int
all_digits_p(const char *s, long len)
{
    long i;

    for (i = 0; i < len; i++) {
        if (!ISDIGIT(s[i])) {
            return 0;
        }
    }
    return 1;
}
4946
4947static int
4948str_upto_i(VALUE str, VALUE arg)
4949{
4950 rb_yield(str);
4951 return 0;
4952}
4953
4954/*
4955 * call-seq:
4956 * upto(other_string, exclusive = false) {|string| ... } -> self
4957 * upto(other_string, exclusive = false) -> new_enumerator
4958 *
4959 * With a block given, calls the block with each \String value
4960 * returned by successive calls to String#succ;
4961 * the first value is +self+, the next is <tt>self.succ</tt>, and so on;
4962 * the sequence terminates when value +other_string+ is reached;
4963 * returns +self+:
4964 *
4965 * 'a8'.upto('b6') {|s| print s, ' ' } # => "a8"
4966 * Output:
4967 *
4968 * a8 a9 b0 b1 b2 b3 b4 b5 b6
4969 *
4970 * If argument +exclusive+ is given as a truthy object, the last value is omitted:
4971 *
4972 * 'a8'.upto('b6', true) {|s| print s, ' ' } # => "a8"
4973 *
4974 * Output:
4975 *
4976 * a8 a9 b0 b1 b2 b3 b4 b5
4977 *
4978 * If +other_string+ would not be reached, does not call the block:
4979 *
4980 * '25'.upto('5') {|s| fail s }
4981 * 'aa'.upto('a') {|s| fail s }
4982 *
4983 * With no block given, returns a new Enumerator:
4984 *
4985 * 'a8'.upto('b6') # => #<Enumerator: "a8":upto("b6")>
4986 *
4987 */
4988
4989static VALUE
4990rb_str_upto(int argc, VALUE *argv, VALUE beg)
4991{
4992 VALUE end, exclusive;
4993
4994 rb_scan_args(argc, argv, "11", &end, &exclusive);
4995 RETURN_ENUMERATOR(beg, argc, argv);
4996 return rb_str_upto_each(beg, end, RTEST(exclusive), str_upto_i, Qnil);
4997}
4998
4999VALUE
5000rb_str_upto_each(VALUE beg, VALUE end, int excl, int (*each)(VALUE, VALUE), VALUE arg)
5001{
5002 VALUE current, after_end;
5003 ID succ;
5004 int n, ascii;
5005 rb_encoding *enc;
5006
5007 CONST_ID(succ, "succ");
5008 StringValue(end);
5009 enc = rb_enc_check(beg, end);
5010 ascii = (is_ascii_string(beg) && is_ascii_string(end));
5011 /* single character */
5012 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1 && ascii) {
5013 char c = RSTRING_PTR(beg)[0];
5014 char e = RSTRING_PTR(end)[0];
5015
5016 if (c > e || (excl && c == e)) return beg;
5017 for (;;) {
5018 if ((*each)(rb_enc_str_new(&c, 1, enc), arg)) break;
5019 if (!excl && c == e) break;
5020 c++;
5021 if (excl && c == e) break;
5022 }
5023 return beg;
5024 }
5025 /* both edges are all digits */
5026 if (ascii && ISDIGIT(RSTRING_PTR(beg)[0]) && ISDIGIT(RSTRING_PTR(end)[0]) &&
5027 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg)) &&
5028 all_digits_p(RSTRING_PTR(end), RSTRING_LEN(end))) {
5029 VALUE b, e;
5030 int width;
5031
5032 width = RSTRING_LENINT(beg);
5033 b = rb_str_to_inum(beg, 10, FALSE);
5034 e = rb_str_to_inum(end, 10, FALSE);
5035 if (FIXNUM_P(b) && FIXNUM_P(e)) {
5036 long bi = FIX2LONG(b);
5037 long ei = FIX2LONG(e);
5038 rb_encoding *usascii = rb_usascii_encoding();
5039
5040 while (bi <= ei) {
5041 if (excl && bi == ei) break;
5042 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5043 bi++;
5044 }
5045 }
5046 else {
5047 ID op = excl ? '<' : idLE;
5048 VALUE args[2], fmt = rb_fstring_lit("%.*d");
5049
5050 args[0] = INT2FIX(width);
5051 while (rb_funcall(b, op, 1, e)) {
5052 args[1] = b;
5053 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5054 b = rb_funcallv(b, succ, 0, 0);
5055 }
5056 }
5057 return beg;
5058 }
5059 /* normal case */
5060 n = rb_str_cmp(beg, end);
5061 if (n > 0 || (excl && n == 0)) return beg;
5062
5063 after_end = rb_funcallv(end, succ, 0, 0);
5064 current = str_duplicate(rb_cString, beg);
5065 while (!rb_str_equal(current, after_end)) {
5066 VALUE next = Qnil;
5067 if (excl || !rb_str_equal(current, end))
5068 next = rb_funcallv(current, succ, 0, 0);
5069 if ((*each)(current, arg)) break;
5070 if (NIL_P(next)) break;
5071 current = next;
5072 StringValue(current);
5073 if (excl && rb_str_equal(current, end)) break;
5074 if (RSTRING_LEN(current) > RSTRING_LEN(end) || RSTRING_LEN(current) == 0)
5075 break;
5076 }
5077
5078 return beg;
5079}
5080
5081VALUE
5082rb_str_upto_endless_each(VALUE beg, int (*each)(VALUE, VALUE), VALUE arg)
5083{
5084 VALUE current;
5085 ID succ;
5086
5087 CONST_ID(succ, "succ");
5088 /* both edges are all digits */
5089 if (is_ascii_string(beg) && ISDIGIT(RSTRING_PTR(beg)[0]) &&
5090 all_digits_p(RSTRING_PTR(beg), RSTRING_LEN(beg))) {
5091 VALUE b, args[2], fmt = rb_fstring_lit("%.*d");
5092 int width = RSTRING_LENINT(beg);
5093 b = rb_str_to_inum(beg, 10, FALSE);
5094 if (FIXNUM_P(b)) {
5095 long bi = FIX2LONG(b);
5096 rb_encoding *usascii = rb_usascii_encoding();
5097
5098 while (FIXABLE(bi)) {
5099 if ((*each)(rb_enc_sprintf(usascii, "%.*ld", width, bi), arg)) break;
5100 bi++;
5101 }
5102 b = LONG2NUM(bi);
5103 }
5104 args[0] = INT2FIX(width);
5105 while (1) {
5106 args[1] = b;
5107 if ((*each)(rb_str_format(numberof(args), args, fmt), arg)) break;
5108 b = rb_funcallv(b, succ, 0, 0);
5109 }
5110 }
5111 /* normal case */
5112 current = str_duplicate(rb_cString, beg);
5113 while (1) {
5114 VALUE next = rb_funcallv(current, succ, 0, 0);
5115 if ((*each)(current, arg)) break;
5116 current = next;
5117 StringValue(current);
5118 if (RSTRING_LEN(current) == 0)
5119 break;
5120 }
5121
5122 return beg;
5123}
5124
5125static int
5126include_range_i(VALUE str, VALUE arg)
5127{
5128 VALUE *argp = (VALUE *)arg;
5129 if (!rb_equal(str, *argp)) return 0;
5130 *argp = Qnil;
5131 return 1;
5132}
5133
5134VALUE
5135rb_str_include_range_p(VALUE beg, VALUE end, VALUE val, VALUE exclusive)
5136{
5137 beg = rb_str_new_frozen(beg);
5138 StringValue(end);
5139 end = rb_str_new_frozen(end);
5140 if (NIL_P(val)) return Qfalse;
5141 val = rb_check_string_type(val);
5142 if (NIL_P(val)) return Qfalse;
5143 if (rb_enc_asciicompat(STR_ENC_GET(beg)) &&
5144 rb_enc_asciicompat(STR_ENC_GET(end)) &&
5145 rb_enc_asciicompat(STR_ENC_GET(val))) {
5146 const char *bp = RSTRING_PTR(beg);
5147 const char *ep = RSTRING_PTR(end);
5148 const char *vp = RSTRING_PTR(val);
5149 if (RSTRING_LEN(beg) == 1 && RSTRING_LEN(end) == 1) {
5150 if (RSTRING_LEN(val) == 0 || RSTRING_LEN(val) > 1)
5151 return Qfalse;
5152 else {
5153 char b = *bp;
5154 char e = *ep;
5155 char v = *vp;
5156
5157 if (ISASCII(b) && ISASCII(e) && ISASCII(v)) {
5158 if (b <= v && v < e) return Qtrue;
5159 return RBOOL(!RTEST(exclusive) && v == e);
5160 }
5161 }
5162 }
5163#if 0
5164 /* both edges are all digits */
5165 if (ISDIGIT(*bp) && ISDIGIT(*ep) &&
5166 all_digits_p(bp, RSTRING_LEN(beg)) &&
5167 all_digits_p(ep, RSTRING_LEN(end))) {
5168 /* TODO */
5169 }
5170#endif
5171 }
5172 rb_str_upto_each(beg, end, RTEST(exclusive), include_range_i, (VALUE)&val);
5173
5174 return RBOOL(NIL_P(val));
5175}
5176
5177static VALUE
5178rb_str_subpat(VALUE str, VALUE re, VALUE backref)
5179{
5180 if (rb_reg_search(re, str, 0, 0) >= 0) {
5181 VALUE match = rb_backref_get();
5182 int nth = rb_reg_backref_number(match, backref);
5183 return rb_reg_nth_match(nth, match);
5184 }
5185 return Qnil;
5186}
5187
5188static VALUE
5189rb_str_aref(VALUE str, VALUE indx)
5190{
5191 long idx;
5192
5193 if (FIXNUM_P(indx)) {
5194 idx = FIX2LONG(indx);
5195 }
5196 else if (RB_TYPE_P(indx, T_REGEXP)) {
5197 return rb_str_subpat(str, indx, INT2FIX(0));
5198 }
5199 else if (RB_TYPE_P(indx, T_STRING)) {
5200 if (rb_str_index(str, indx, 0) != -1)
5201 return str_duplicate(rb_cString, indx);
5202 return Qnil;
5203 }
5204 else {
5205 /* check if indx is Range */
5206 long beg, len = str_strlen(str, NULL);
5207 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
5208 case Qfalse:
5209 break;
5210 case Qnil:
5211 return Qnil;
5212 default:
5213 return rb_str_substr(str, beg, len);
5214 }
5215 idx = NUM2LONG(indx);
5216 }
5217
5218 return str_substr(str, idx, 1, FALSE);
5219}
5220
5221
5222/*
5223 * call-seq:
5224 * string[index] -> new_string or nil
5225 * string[start, length] -> new_string or nil
5226 * string[range] -> new_string or nil
5227 * string[regexp, capture = 0] -> new_string or nil
5228 * string[substring] -> new_string or nil
5229 *
5230 * Returns the substring of +self+ specified by the arguments.
5231 * See examples at {String Slices}[rdoc-ref:String@String+Slices].
5232 *
5233 *
5234 */
5235
5236static VALUE
5237rb_str_aref_m(int argc, VALUE *argv, VALUE str)
5238{
5239 if (argc == 2) {
5240 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5241 return rb_str_subpat(str, argv[0], argv[1]);
5242 }
5243 else {
5244 long beg = NUM2LONG(argv[0]);
5245 long len = NUM2LONG(argv[1]);
5246 return rb_str_substr(str, beg, len);
5247 }
5248 }
5249 rb_check_arity(argc, 1, 2);
5250 return rb_str_aref(str, argv[0]);
5251}
5252
5253VALUE
5255{
5256 char *ptr = RSTRING_PTR(str);
5257 long olen = RSTRING_LEN(str), nlen;
5258
5259 str_modifiable(str);
5260 if (len > olen) len = olen;
5261 nlen = olen - len;
5262 if (str_embed_capa(str) >= nlen + TERM_LEN(str)) {
5263 char *oldptr = ptr;
5264 int fl = (int)(RBASIC(str)->flags & (STR_NOEMBED|STR_SHARED|STR_NOFREE));
5265 STR_SET_EMBED(str);
5266 ptr = RSTRING(str)->as.embed.ary;
5267 memmove(ptr, oldptr + len, nlen);
5268 if (fl == STR_NOEMBED) xfree(oldptr);
5269 }
5270 else {
5271 if (!STR_SHARED_P(str)) {
5272 VALUE shared = heap_str_make_shared(rb_obj_class(str), str);
5273 rb_enc_cr_str_exact_copy(shared, str);
5274 OBJ_FREEZE(shared);
5275 }
5276 ptr = RSTRING(str)->as.heap.ptr += len;
5277 }
5278 STR_SET_LEN(str, nlen);
5279
5280 if (!SHARABLE_MIDDLE_SUBSTRING) {
5281 TERM_FILL(ptr + nlen, TERM_LEN(str));
5282 }
5284 return str;
5285}
5286
5287static void
5288rb_str_update_1(VALUE str, long beg, long len, VALUE val, long vbeg, long vlen)
5289{
5290 char *sptr;
5291 long slen;
5292 int cr;
5293
5294 if (beg == 0 && vlen == 0) {
5295 rb_str_drop_bytes(str, len);
5296 return;
5297 }
5298
5299 str_modify_keep_cr(str);
5300 RSTRING_GETMEM(str, sptr, slen);
5301 if (len < vlen) {
5302 /* expand string */
5303 RESIZE_CAPA(str, slen + vlen - len);
5304 sptr = RSTRING_PTR(str);
5305 }
5306
5308 cr = rb_enc_str_coderange(val);
5309 else
5311
5312 if (vlen != len) {
5313 memmove(sptr + beg + vlen,
5314 sptr + beg + len,
5315 slen - (beg + len));
5316 }
5317 if (vlen < beg && len < 0) {
5318 MEMZERO(sptr + slen, char, -len);
5319 }
5320 if (vlen > 0) {
5321 memmove(sptr + beg, RSTRING_PTR(val) + vbeg, vlen);
5322 }
5323 slen += vlen - len;
5324 STR_SET_LEN(str, slen);
5325 TERM_FILL(&sptr[slen], TERM_LEN(str));
5326 ENC_CODERANGE_SET(str, cr);
5327}
5328
5329static inline void
5330rb_str_update_0(VALUE str, long beg, long len, VALUE val)
5331{
5332 rb_str_update_1(str, beg, len, val, 0, RSTRING_LEN(val));
5333}
5334
5335void
5336rb_str_update(VALUE str, long beg, long len, VALUE val)
5337{
5338 long slen;
5339 char *p, *e;
5340 rb_encoding *enc;
5341 int singlebyte = single_byte_optimizable(str);
5342 int cr;
5343
5344 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
5345
5346 StringValue(val);
5347 enc = rb_enc_check(str, val);
5348 slen = str_strlen(str, enc); /* rb_enc_check */
5349
5350 if ((slen < beg) || ((beg < 0) && (beg + slen < 0))) {
5351 rb_raise(rb_eIndexError, "index %ld out of string", beg);
5352 }
5353 if (beg < 0) {
5354 beg += slen;
5355 }
5356 assert(beg >= 0);
5357 assert(beg <= slen);
5358 if (len > slen - beg) {
5359 len = slen - beg;
5360 }
5361 p = str_nth(RSTRING_PTR(str), RSTRING_END(str), beg, enc, singlebyte);
5362 if (!p) p = RSTRING_END(str);
5363 e = str_nth(p, RSTRING_END(str), len, enc, singlebyte);
5364 if (!e) e = RSTRING_END(str);
5365 /* error check */
5366 beg = p - RSTRING_PTR(str); /* physical position */
5367 len = e - p; /* physical length */
5368 rb_str_update_0(str, beg, len, val);
5369 rb_enc_associate(str, enc);
5371 if (cr != ENC_CODERANGE_BROKEN)
5372 ENC_CODERANGE_SET(str, cr);
5373}
5374
5375static void
5376rb_str_subpat_set(VALUE str, VALUE re, VALUE backref, VALUE val)
5377{
5378 int nth;
5379 VALUE match;
5380 long start, end, len;
5381 rb_encoding *enc;
5382 struct re_registers *regs;
5383
5384 if (rb_reg_search(re, str, 0, 0) < 0) {
5385 rb_raise(rb_eIndexError, "regexp not matched");
5386 }
5387 match = rb_backref_get();
5388 nth = rb_reg_backref_number(match, backref);
5389 regs = RMATCH_REGS(match);
5390 if ((nth >= regs->num_regs) || ((nth < 0) && (-nth >= regs->num_regs))) {
5391 rb_raise(rb_eIndexError, "index %d out of regexp", nth);
5392 }
5393 if (nth < 0) {
5394 nth += regs->num_regs;
5395 }
5396
5397 start = BEG(nth);
5398 if (start == -1) {
5399 rb_raise(rb_eIndexError, "regexp group %d not matched", nth);
5400 }
5401 end = END(nth);
5402 len = end - start;
5403 StringValue(val);
5404 enc = rb_enc_check_str(str, val);
5405 rb_str_update_0(str, start, len, val);
5406 rb_enc_associate(str, enc);
5407}
5408
5409static VALUE
5410rb_str_aset(VALUE str, VALUE indx, VALUE val)
5411{
5412 long idx, beg;
5413
5414 switch (TYPE(indx)) {
5415 case T_REGEXP:
5416 rb_str_subpat_set(str, indx, INT2FIX(0), val);
5417 return val;
5418
5419 case T_STRING:
5420 beg = rb_str_index(str, indx, 0);
5421 if (beg < 0) {
5422 rb_raise(rb_eIndexError, "string not matched");
5423 }
5424 beg = rb_str_sublen(str, beg);
5425 rb_str_update(str, beg, str_strlen(indx, NULL), val);
5426 return val;
5427
5428 default:
5429 /* check if indx is Range */
5430 {
5431 long beg, len;
5432 if (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 2)) {
5433 rb_str_update(str, beg, len, val);
5434 return val;
5435 }
5436 }
5437 /* FALLTHROUGH */
5438
5439 case T_FIXNUM:
5440 idx = NUM2LONG(indx);
5441 rb_str_update(str, idx, 1, val);
5442 return val;
5443 }
5444}
5445
5446/*
5447 * call-seq:
5448 * string[index] = new_string
5449 * string[start, length] = new_string
5450 * string[range] = new_string
5451 * string[regexp, capture = 0] = new_string
5452 * string[substring] = new_string
5453 *
5454 * Replaces all, some, or none of the contents of +self+; returns +new_string+.
5455 * See {String Slices}[rdoc-ref:String@String+Slices].
5456 *
5457 * A few examples:
5458 *
5459 * s = 'foo'
5460 * s[2] = 'rtune' # => "rtune"
5461 * s # => "fortune"
5462 * s[1, 5] = 'init' # => "init"
5463 * s # => "finite"
5464 * s[3..4] = 'al' # => "al"
5465 * s # => "finale"
5466 * s[/e$/] = 'ly' # => "ly"
5467 * s # => "finally"
5468 * s['lly'] = 'ncial' # => "ncial"
5469 * s # => "financial"
5470 *
5471 */
5472
5473static VALUE
5474rb_str_aset_m(int argc, VALUE *argv, VALUE str)
5475{
5476 if (argc == 3) {
5477 if (RB_TYPE_P(argv[0], T_REGEXP)) {
5478 rb_str_subpat_set(str, argv[0], argv[1], argv[2]);
5479 }
5480 else {
5481 rb_str_update(str, NUM2LONG(argv[0]), NUM2LONG(argv[1]), argv[2]);
5482 }
5483 return argv[2];
5484 }
5485 rb_check_arity(argc, 2, 3);
5486 return rb_str_aset(str, argv[0], argv[1]);
5487}
5488
5489/*
5490 * call-seq:
5491 * insert(index, other_string) -> self
5492 *
5493 * Inserts the given +other_string+ into +self+; returns +self+.
5494 *
5495 * If the Integer +index+ is positive, inserts +other_string+ at offset +index+:
5496 *
5497 * 'foo'.insert(1, 'bar') # => "fbaroo"
5498 *
5499 * If the Integer +index+ is negative, counts backward from the end of +self+
5500 * and inserts +other_string+ at offset <tt>index+1</tt>
5501 * (that is, _after_ <tt>self[index]</tt>):
5502 *
5503 * 'foo'.insert(-2, 'bar') # => "fobaro"
5504 *
5505 */
5506
5507static VALUE
5508rb_str_insert(VALUE str, VALUE idx, VALUE str2)
5509{
5510 long pos = NUM2LONG(idx);
5511
5512 if (pos == -1) {
5513 return rb_str_append(str, str2);
5514 }
5515 else if (pos < 0) {
5516 pos++;
5517 }
5518 rb_str_update(str, pos, 0, str2);
5519 return str;
5520}
5521
5522
5523/*
5524 * call-seq:
5525 * slice!(index) -> new_string or nil
5526 * slice!(start, length) -> new_string or nil
5527 * slice!(range) -> new_string or nil
5528 * slice!(regexp, capture = 0) -> new_string or nil
5529 * slice!(substring) -> new_string or nil
5530 *
5531 * Removes and returns the substring of +self+ specified by the arguments.
5532 * See {String Slices}[rdoc-ref:String@String+Slices].
5533 *
5534 * A few examples:
5535 *
5536 * string = "This is a string"
5537 * string.slice!(2) #=> "i"
5538 * string.slice!(3..6) #=> " is "
5539 * string.slice!(/s.*t/) #=> "sa st"
5540 * string.slice!("r") #=> "r"
5541 * string #=> "Thing"
5542 *
5543 */
5544
5545static VALUE
5546rb_str_slice_bang(int argc, VALUE *argv, VALUE str)
5547{
5548 VALUE result = Qnil;
5549 VALUE indx;
5550 long beg, len = 1;
5551 char *p;
5552
5553 rb_check_arity(argc, 1, 2);
5554 str_modify_keep_cr(str);
5555 indx = argv[0];
5556 if (RB_TYPE_P(indx, T_REGEXP)) {
5557 if (rb_reg_search(indx, str, 0, 0) < 0) return Qnil;
5558 VALUE match = rb_backref_get();
5559 struct re_registers *regs = RMATCH_REGS(match);
5560 int nth = 0;
5561 if (argc > 1 && (nth = rb_reg_backref_number(match, argv[1])) < 0) {
5562 if ((nth += regs->num_regs) <= 0) return Qnil;
5563 }
5564 else if (nth >= regs->num_regs) return Qnil;
5565 beg = BEG(nth);
5566 len = END(nth) - beg;
5567 goto subseq;
5568 }
5569 else if (argc == 2) {
5570 beg = NUM2LONG(indx);
5571 len = NUM2LONG(argv[1]);
5572 goto num_index;
5573 }
5574 else if (FIXNUM_P(indx)) {
5575 beg = FIX2LONG(indx);
5576 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5577 if (!len) return Qnil;
5578 beg = p - RSTRING_PTR(str);
5579 goto subseq;
5580 }
5581 else if (RB_TYPE_P(indx, T_STRING)) {
5582 beg = rb_str_index(str, indx, 0);
5583 if (beg == -1) return Qnil;
5584 len = RSTRING_LEN(indx);
5585 result = str_duplicate(rb_cString, indx);
5586 goto squash;
5587 }
5588 else {
5589 switch (rb_range_beg_len(indx, &beg, &len, str_strlen(str, NULL), 0)) {
5590 case Qnil:
5591 return Qnil;
5592 case Qfalse:
5593 beg = NUM2LONG(indx);
5594 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5595 if (!len) return Qnil;
5596 beg = p - RSTRING_PTR(str);
5597 goto subseq;
5598 default:
5599 goto num_index;
5600 }
5601 }
5602
5603 num_index:
5604 if (!(p = rb_str_subpos(str, beg, &len))) return Qnil;
5605 beg = p - RSTRING_PTR(str);
5606
5607 subseq:
5608 result = rb_str_new(RSTRING_PTR(str)+beg, len);
5609 rb_enc_cr_str_copy_for_substr(result, str);
5610
5611 squash:
5612 if (len > 0) {
5613 if (beg == 0) {
5614 rb_str_drop_bytes(str, len);
5615 }
5616 else {
5617 char *sptr = RSTRING_PTR(str);
5618 long slen = RSTRING_LEN(str);
5619 if (beg + len > slen) /* pathological check */
5620 len = slen - beg;
5621 memmove(sptr + beg,
5622 sptr + beg + len,
5623 slen - (beg + len));
5624 slen -= len;
5625 STR_SET_LEN(str, slen);
5626 TERM_FILL(&sptr[slen], TERM_LEN(str));
5627 }
5628 }
5629 return result;
5630}
5631
5632static VALUE
5633get_pat(VALUE pat)
5634{
5635 VALUE val;
5636
5637 switch (OBJ_BUILTIN_TYPE(pat)) {
5638 case T_REGEXP:
5639 return pat;
5640
5641 case T_STRING:
5642 break;
5643
5644 default:
5645 val = rb_check_string_type(pat);
5646 if (NIL_P(val)) {
5647 Check_Type(pat, T_REGEXP);
5648 }
5649 pat = val;
5650 }
5651
5652 return rb_reg_regcomp(pat);
5653}
5654
5655static VALUE
5656get_pat_quoted(VALUE pat, int check)
5657{
5658 VALUE val;
5659
5660 switch (OBJ_BUILTIN_TYPE(pat)) {
5661 case T_REGEXP:
5662 return pat;
5663
5664 case T_STRING:
5665 break;
5666
5667 default:
5668 val = rb_check_string_type(pat);
5669 if (NIL_P(val)) {
5670 Check_Type(pat, T_REGEXP);
5671 }
5672 pat = val;
5673 }
5674 if (check && is_broken_string(pat)) {
5675 rb_exc_raise(rb_reg_check_preprocess(pat));
5676 }
5677 return pat;
5678}
5679
5680static long
5681rb_pat_search(VALUE pat, VALUE str, long pos, int set_backref_str)
5682{
5683 if (BUILTIN_TYPE(pat) == T_STRING) {
5684 pos = rb_str_byteindex(str, pat, pos);
5685 if (set_backref_str) {
5686 if (pos >= 0) {
5687 str = rb_str_new_frozen_String(str);
5688 rb_backref_set_string(str, pos, RSTRING_LEN(pat));
5689 }
5690 else {
5692 }
5693 }
5694 return pos;
5695 }
5696 else {
5697 return rb_reg_search0(pat, str, pos, 0, set_backref_str);
5698 }
5699}
5700
5701
5702/*
5703 * call-seq:
5704 * sub!(pattern, replacement) -> self or nil
5705 * sub!(pattern) {|match| ... } -> self or nil
5706 *
5707 * Returns +self+ with only the first occurrence
5708 * (not all occurrences) of the given +pattern+ replaced.
5709 *
5710 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5711 *
5712 * Related: String#sub, String#gsub, String#gsub!.
5713 *
5714 */
5715
5716static VALUE
5717rb_str_sub_bang(int argc, VALUE *argv, VALUE str)
5718{
5719 VALUE pat, repl, hash = Qnil;
5720 int iter = 0;
5721 long plen;
5722 int min_arity = rb_block_given_p() ? 1 : 2;
5723 long beg;
5724
5725 rb_check_arity(argc, min_arity, 2);
5726 if (argc == 1) {
5727 iter = 1;
5728 }
5729 else {
5730 repl = argv[1];
5731 hash = rb_check_hash_type(argv[1]);
5732 if (NIL_P(hash)) {
5733 StringValue(repl);
5734 }
5735 }
5736
5737 pat = get_pat_quoted(argv[0], 1);
5738
5739 str_modifiable(str);
5740 beg = rb_pat_search(pat, str, 0, 1);
5741 if (beg >= 0) {
5742 rb_encoding *enc;
5743 int cr = ENC_CODERANGE(str);
5744 long beg0, end0;
5745 VALUE match, match0 = Qnil;
5746 struct re_registers *regs;
5747 char *p, *rp;
5748 long len, rlen;
5749
5750 match = rb_backref_get();
5751 regs = RMATCH_REGS(match);
5752 if (RB_TYPE_P(pat, T_STRING)) {
5753 beg0 = beg;
5754 end0 = beg0 + RSTRING_LEN(pat);
5755 match0 = pat;
5756 }
5757 else {
5758 beg0 = BEG(0);
5759 end0 = END(0);
5760 if (iter) match0 = rb_reg_nth_match(0, match);
5761 }
5762
5763 if (iter || !NIL_P(hash)) {
5764 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5765
5766 if (iter) {
5767 repl = rb_obj_as_string(rb_yield(match0));
5768 }
5769 else {
5770 repl = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5771 repl = rb_obj_as_string(repl);
5772 }
5773 str_mod_check(str, p, len);
5774 rb_check_frozen(str);
5775 }
5776 else {
5777 repl = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5778 }
5779
5780 enc = rb_enc_compatible(str, repl);
5781 if (!enc) {
5782 rb_encoding *str_enc = STR_ENC_GET(str);
5783 p = RSTRING_PTR(str); len = RSTRING_LEN(str);
5784 if (coderange_scan(p, beg0, str_enc) != ENC_CODERANGE_7BIT ||
5785 coderange_scan(p+end0, len-end0, str_enc) != ENC_CODERANGE_7BIT) {
5786 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
5787 rb_enc_name(str_enc),
5788 rb_enc_name(STR_ENC_GET(repl)));
5789 }
5790 enc = STR_ENC_GET(repl);
5791 }
5792 rb_str_modify(str);
5793 rb_enc_associate(str, enc);
5795 int cr2 = ENC_CODERANGE(repl);
5796 if (cr2 == ENC_CODERANGE_BROKEN ||
5797 (cr == ENC_CODERANGE_VALID && cr2 == ENC_CODERANGE_7BIT))
5799 else
5800 cr = cr2;
5801 }
5802 plen = end0 - beg0;
5803 rlen = RSTRING_LEN(repl);
5804 len = RSTRING_LEN(str);
5805 if (rlen > plen) {
5806 RESIZE_CAPA(str, len + rlen - plen);
5807 }
5808 p = RSTRING_PTR(str);
5809 if (rlen != plen) {
5810 memmove(p + beg0 + rlen, p + beg0 + plen, len - beg0 - plen);
5811 }
5812 rp = RSTRING_PTR(repl);
5813 memmove(p + beg0, rp, rlen);
5814 len += rlen - plen;
5815 STR_SET_LEN(str, len);
5816 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
5817 ENC_CODERANGE_SET(str, cr);
5818
5819 RB_GC_GUARD(match);
5820
5821 return str;
5822 }
5823 return Qnil;
5824}
5825
5826
5827/*
5828 * call-seq:
5829 * sub(pattern, replacement) -> new_string
5830 * sub(pattern) {|match| ... } -> new_string
5831 *
5832 * Returns a copy of +self+ with only the first occurrence
5833 * (not all occurrences) of the given +pattern+ replaced.
5834 *
5835 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5836 *
5837 * Related: String#sub!, String#gsub, String#gsub!.
5838 *
5839 */
5840
5841static VALUE
5842rb_str_sub(int argc, VALUE *argv, VALUE str)
5843{
5844 str = str_duplicate(rb_cString, str);
5845 rb_str_sub_bang(argc, argv, str);
5846 return str;
5847}
5848
5849static VALUE
5850str_gsub(int argc, VALUE *argv, VALUE str, int bang)
5851{
5852 VALUE pat, val = Qnil, repl, match0 = Qnil, dest, hash = Qnil;
5853 long beg, beg0, end0;
5854 long offset, blen, slen, len, last;
5855 enum {STR, ITER, MAP} mode = STR;
5856 char *sp, *cp;
5857 int need_backref = -1;
5858 rb_encoding *str_enc;
5859
5860 switch (argc) {
5861 case 1:
5862 RETURN_ENUMERATOR(str, argc, argv);
5863 mode = ITER;
5864 break;
5865 case 2:
5866 repl = argv[1];
5867 hash = rb_check_hash_type(argv[1]);
5868 if (NIL_P(hash)) {
5869 StringValue(repl);
5870 }
5871 else {
5872 mode = MAP;
5873 }
5874 break;
5875 default:
5876 rb_error_arity(argc, 1, 2);
5877 }
5878
5879 pat = get_pat_quoted(argv[0], 1);
5880 beg = rb_pat_search(pat, str, 0, need_backref);
5881 if (beg < 0) {
5882 if (bang) return Qnil; /* no match, no substitution */
5883 return str_duplicate(rb_cString, str);
5884 }
5885
5886 offset = 0;
5887 blen = RSTRING_LEN(str) + 30; /* len + margin */
5888 dest = rb_str_buf_new(blen);
5889 sp = RSTRING_PTR(str);
5890 slen = RSTRING_LEN(str);
5891 cp = sp;
5892 str_enc = STR_ENC_GET(str);
5893 rb_enc_associate(dest, str_enc);
5894 ENC_CODERANGE_SET(dest, rb_enc_asciicompat(str_enc) ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID);
5895
5896 do {
5897 VALUE match = rb_backref_get();
5898 struct re_registers *regs = RMATCH_REGS(match);
5899 if (RB_TYPE_P(pat, T_STRING)) {
5900 beg0 = beg;
5901 end0 = beg0 + RSTRING_LEN(pat);
5902 match0 = pat;
5903 }
5904 else {
5905 beg0 = BEG(0);
5906 end0 = END(0);
5907 if (mode == ITER) match0 = rb_reg_nth_match(0, match);
5908 }
5909
5910 if (mode) {
5911 if (mode == ITER) {
5912 val = rb_obj_as_string(rb_yield(match0));
5913 }
5914 else {
5915 val = rb_hash_aref(hash, rb_str_subseq(str, beg0, end0 - beg0));
5916 val = rb_obj_as_string(val);
5917 }
5918 str_mod_check(str, sp, slen);
5919 if (val == dest) { /* paranoid check [ruby-dev:24827] */
5920 rb_raise(rb_eRuntimeError, "block should not cheat");
5921 }
5922 }
5923 else if (need_backref) {
5924 val = rb_reg_regsub(repl, str, regs, RB_TYPE_P(pat, T_STRING) ? Qnil : pat);
5925 if (need_backref < 0) {
5926 need_backref = val != repl;
5927 }
5928 }
5929 else {
5930 val = repl;
5931 }
5932
5933 len = beg0 - offset; /* copy pre-match substr */
5934 if (len) {
5935 rb_enc_str_buf_cat(dest, cp, len, str_enc);
5936 }
5937
5938 rb_str_buf_append(dest, val);
5939
5940 last = offset;
5941 offset = end0;
5942 if (beg0 == end0) {
5943 /*
5944 * Always consume at least one character of the input string
5945 * in order to prevent infinite loops.
5946 */
5947 if (RSTRING_LEN(str) <= end0) break;
5948 len = rb_enc_fast_mbclen(RSTRING_PTR(str)+end0, RSTRING_END(str), str_enc);
5949 rb_enc_str_buf_cat(dest, RSTRING_PTR(str)+end0, len, str_enc);
5950 offset = end0 + len;
5951 }
5952 cp = RSTRING_PTR(str) + offset;
5953 if (offset > RSTRING_LEN(str)) break;
5954 beg = rb_pat_search(pat, str, offset, need_backref);
5955
5956 RB_GC_GUARD(match);
5957 } while (beg >= 0);
5958 if (RSTRING_LEN(str) > offset) {
5959 rb_enc_str_buf_cat(dest, cp, RSTRING_LEN(str) - offset, str_enc);
5960 }
5961 rb_pat_search(pat, str, last, 1);
5962 if (bang) {
5963 str_shared_replace(str, dest);
5964 }
5965 else {
5966 str = dest;
5967 }
5968
5969 return str;
5970}
5971
5972
5973/*
5974 * call-seq:
5975 * gsub!(pattern, replacement) -> self or nil
5976 * gsub!(pattern) {|match| ... } -> self or nil
5977 * gsub!(pattern) -> an_enumerator
5978 *
5979 * Performs the specified substring replacement(s) on +self+;
5980 * returns +self+ if any replacement occurred, +nil+ otherwise.
5981 *
5982 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
5983 *
5984 * Returns an Enumerator if no +replacement+ and no block given.
5985 *
5986 * Related: String#sub, String#gsub, String#sub!.
5987 *
5988 */
5989
5990static VALUE
5991rb_str_gsub_bang(int argc, VALUE *argv, VALUE str)
5992{
5993 str_modify_keep_cr(str);
5994 return str_gsub(argc, argv, str, 1);
5995}
5996
5997
5998/*
5999 * call-seq:
6000 * gsub(pattern, replacement) -> new_string
6001 * gsub(pattern) {|match| ... } -> new_string
6002 * gsub(pattern) -> enumerator
6003 *
6004 * Returns a copy of +self+ with all occurrences of the given +pattern+ replaced.
6005 *
6006 * See {Substitution Methods}[rdoc-ref:String@Substitution+Methods].
6007 *
6008 * Returns an Enumerator if no +replacement+ and no block given.
6009 *
6010 * Related: String#sub, String#sub!, String#gsub!.
6011 *
6012 */
6013
6014static VALUE
6015rb_str_gsub(int argc, VALUE *argv, VALUE str)
6016{
6017 return str_gsub(argc, argv, str, 0);
6018}
6019
6020
6021/*
6022 * call-seq:
6023 * replace(other_string) -> self
6024 *
6025 * Replaces the contents of +self+ with the contents of +other_string+:
6026 *
6027 * s = 'foo' # => "foo"
6028 * s.replace('bar') # => "bar"
6029 *
6030 */
6031
6032VALUE
6034{
6035 str_modifiable(str);
6036 if (str == str2) return str;
6037
6038 StringValue(str2);
6039 str_discard(str);
6040 return str_replace(str, str2);
6041}
6042
6043/*
6044 * call-seq:
6045 * clear -> self
6046 *
6047 * Removes the contents of +self+:
6048 *
6049 * s = 'foo' # => "foo"
6050 * s.clear # => ""
6051 *
6052 */
6053
6054static VALUE
6055rb_str_clear(VALUE str)
6056{
6057 str_discard(str);
6058 STR_SET_EMBED(str);
6059 STR_SET_LEN(str, 0);
6060 RSTRING_PTR(str)[0] = 0;
6061 if (rb_enc_asciicompat(STR_ENC_GET(str)))
6063 else
6065 return str;
6066}
6067
6068/*
6069 * call-seq:
6070 * chr -> string
6071 *
6072 * Returns a string containing the first character of +self+:
6073 *
6074 * s = 'foo' # => "foo"
6075 * s.chr # => "f"
6076 *
6077 */
6078
6079static VALUE
6080rb_str_chr(VALUE str)
6081{
6082 return rb_str_substr(str, 0, 1);
6083}
6084
6085/*
6086 * call-seq:
6087 * getbyte(index) -> integer or nil
6088 *
6089 * Returns the byte at zero-based +index+ as an integer, or +nil+ if +index+ is out of range:
6090 *
6091 * s = 'abcde' # => "abcde"
6092 * s.getbyte(0) # => 97
6093 * s.getbyte(-1) # => 101
6094 * s.getbyte(5) # => nil
6095 *
6096 * Related: String#setbyte.
6097 */
6098VALUE
6099rb_str_getbyte(VALUE str, VALUE index)
6100{
6101 long pos = NUM2LONG(index);
6102
6103 if (pos < 0)
6104 pos += RSTRING_LEN(str);
6105 if (pos < 0 || RSTRING_LEN(str) <= pos)
6106 return Qnil;
6107
6108 return INT2FIX((unsigned char)RSTRING_PTR(str)[pos]);
6109}
6110
6111/*
6112 * call-seq:
6113 * setbyte(index, integer) -> integer
6114 *
6115 * Sets the byte at zero-based +index+ to +integer+; returns +integer+:
6116 *
6117 * s = 'abcde' # => "abcde"
6118 * s.setbyte(0, 98) # => 98
6119 * s # => "bbcde"
6120 *
6121 * Related: String#getbyte.
6122 */
6123static VALUE
6124rb_str_setbyte(VALUE str, VALUE index, VALUE value)
6125{
6126 long pos = NUM2LONG(index);
6127 long len = RSTRING_LEN(str);
6128 char *ptr, *head, *left = 0;
6129 rb_encoding *enc;
6130 int cr = ENC_CODERANGE_UNKNOWN, width, nlen;
6131
6132 if (pos < -len || len <= pos)
6133 rb_raise(rb_eIndexError, "index %ld out of string", pos);
6134 if (pos < 0)
6135 pos += len;
6136
6137 VALUE v = rb_to_int(value);
6138 VALUE w = rb_int_and(v, INT2FIX(0xff));
6139 char byte = (char)(NUM2INT(w) & 0xFF);
6140
6141 if (!str_independent(str))
6142 str_make_independent(str);
6143 enc = STR_ENC_GET(str);
6144 head = RSTRING_PTR(str);
6145 ptr = &head[pos];
6146 if (!STR_EMBED_P(str)) {
6147 cr = ENC_CODERANGE(str);
6148 switch (cr) {
6149 case ENC_CODERANGE_7BIT:
6150 left = ptr;
6151 *ptr = byte;
6152 if (ISASCII(byte)) goto end;
6153 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6154 if (!MBCLEN_CHARFOUND_P(nlen))
6156 else
6158 goto end;
6160 left = rb_enc_left_char_head(head, ptr, head+len, enc);
6161 width = rb_enc_precise_mbclen(left, head+len, enc);
6162 *ptr = byte;
6163 nlen = rb_enc_precise_mbclen(left, head+len, enc);
6164 if (!MBCLEN_CHARFOUND_P(nlen))
6166 else if (MBCLEN_CHARFOUND_LEN(nlen) != width || ISASCII(byte))
6168 goto end;
6169 }
6170 }
6172 *ptr = byte;
6173
6174 end:
6175 return value;
6176}
6177
6178static VALUE
6179str_byte_substr(VALUE str, long beg, long len, int empty)
6180{
6181 long n = RSTRING_LEN(str);
6182
6183 if (beg > n || len < 0) return Qnil;
6184 if (beg < 0) {
6185 beg += n;
6186 if (beg < 0) return Qnil;
6187 }
6188 if (len > n - beg)
6189 len = n - beg;
6190 if (len <= 0) {
6191 if (!empty) return Qnil;
6192 len = 0;
6193 }
6194
6195 VALUE str2 = str_subseq(str, beg, len);
6196
6197 str_enc_copy_direct(str2, str);
6198
6199 if (RSTRING_LEN(str2) == 0) {
6200 if (!rb_enc_asciicompat(STR_ENC_GET(str)))
6202 else
6204 }
6205 else {
6206 switch (ENC_CODERANGE(str)) {
6207 case ENC_CODERANGE_7BIT:
6209 break;
6210 default:
6212 break;
6213 }
6214 }
6215
6216 return str2;
6217}
6218
6219static VALUE
6220str_byte_aref(VALUE str, VALUE indx)
6221{
6222 long idx;
6223 if (FIXNUM_P(indx)) {
6224 idx = FIX2LONG(indx);
6225 }
6226 else {
6227 /* check if indx is Range */
6228 long beg, len = RSTRING_LEN(str);
6229
6230 switch (rb_range_beg_len(indx, &beg, &len, len, 0)) {
6231 case Qfalse:
6232 break;
6233 case Qnil:
6234 return Qnil;
6235 default:
6236 return str_byte_substr(str, beg, len, TRUE);
6237 }
6238
6239 idx = NUM2LONG(indx);
6240 }
6241 return str_byte_substr(str, idx, 1, FALSE);
6242}
6243
6244/*
6245 * call-seq:
6246 * byteslice(index, length = 1) -> string or nil
6247 * byteslice(range) -> string or nil
6248 *
6249 * Returns a substring of +self+, or +nil+ if the substring cannot be constructed.
6250 *
6251 * With integer arguments +index+ and +length+ given,
6252 * returns the substring beginning at the given +index+
6253 * of the given +length+ (if possible),
6254 * or +nil+ if +length+ is negative or +index+ falls outside of +self+:
6255 *
6256 * s = '0123456789' # => "0123456789"
6257 * s.byteslice(2) # => "2"
6258 * s.byteslice(200) # => nil
6259 * s.byteslice(4, 3) # => "456"
6260 * s.byteslice(4, 30) # => "456789"
6261 * s.byteslice(4, -1) # => nil
6262 * s.byteslice(40, 2) # => nil
6263 *
6264 * In either case above, counts backwards from the end of +self+
6265 * if +index+ is negative:
6266 *
6267 * s = '0123456789' # => "0123456789"
6268 * s.byteslice(-4) # => "6"
6269 * s.byteslice(-4, 3) # => "678"
6270 *
6271 * With Range argument +range+ given, returns
6272 * <tt>byteslice(range.begin, range.size)</tt>:
6273 *
6274 * s = '0123456789' # => "0123456789"
6275 * s.byteslice(4..6) # => "456"
6276 * s.byteslice(-6..-4) # => "456"
6277 * s.byteslice(5..2) # => "" # range.size is zero.
6278 * s.byteslice(40..42) # => nil
6279 *
6280 * In all cases, a returned string has the same encoding as +self+:
6281 *
6282 * s.encoding # => #<Encoding:UTF-8>
6283 * s.byteslice(4).encoding # => #<Encoding:UTF-8>
6284 *
6285 */
6286
6287static VALUE
6288rb_str_byteslice(int argc, VALUE *argv, VALUE str)
6289{
6290 if (argc == 2) {
6291 long beg = NUM2LONG(argv[0]);
6292 long len = NUM2LONG(argv[1]);
6293 return str_byte_substr(str, beg, len, TRUE);
6294 }
6295 rb_check_arity(argc, 1, 2);
6296 return str_byte_aref(str, argv[0]);
6297}
6298
6299static void
6300str_check_beg_len(VALUE str, long *beg, long *len)
6301{
6302 long end, slen = RSTRING_LEN(str);
6303
6304 if (*len < 0) rb_raise(rb_eIndexError, "negative length %ld", *len);
6305 if ((slen < *beg) || ((*beg < 0) && (*beg + slen < 0))) {
6306 rb_raise(rb_eIndexError, "index %ld out of string", *beg);
6307 }
6308 if (*beg < 0) {
6309 *beg += slen;
6310 }
6311 assert(*beg >= 0);
6312 assert(*beg <= slen);
6313 if (*len > slen - *beg) {
6314 *len = slen - *beg;
6315 }
6316 end = *beg + *len;
6317 str_ensure_byte_pos(str, *beg);
6318 str_ensure_byte_pos(str, end);
6319}
6320
6321/*
6322 * call-seq:
6323 * bytesplice(index, length, str) -> string
6324 * bytesplice(index, length, str, str_index, str_length) -> string
6325 * bytesplice(range, str) -> string
6326 * bytesplice(range, str, str_range) -> string
6327 *
6328 * Replaces some or all of the content of +self+ with +str+, and returns +self+.
6329 * The portion of the string affected is determined using
6330 * the same criteria as String#byteslice, except that +length+ cannot be omitted.
6331 * If the replacement string is not the same length as the text it is replacing,
6332 * the string will be adjusted accordingly.
6333 *
6334 * If +str_index+ and +str_length+, or +str_range+ are given, the content of +self+ is replaced by str.byteslice(str_index, str_length) or str.byteslice(str_range); however the substring of +str+ is not allocated as a new string.
6335 *
6336 * The forms that take an Integer will raise an IndexError if the value is out
6337 * of range; the forms that take a Range will raise a RangeError.
6338 * If the beginning or ending offset does not land on character (codepoint)
6339 * boundary, an IndexError will be raised.
6340 */
6341
6342static VALUE
6343rb_str_bytesplice(int argc, VALUE *argv, VALUE str)
6344{
6345 long beg, len, vbeg, vlen;
6346 VALUE val;
6347 rb_encoding *enc;
6348 int cr;
6349
6350 rb_check_arity(argc, 2, 5);
6351 if (!(argc == 2 || argc == 3 || argc == 5)) {
6352 rb_raise(rb_eArgError, "wrong number of arguments (given %d, expected 2, 3, or 5)", argc);
6353 }
6354 if (argc == 2 || (argc == 3 && !RB_INTEGER_TYPE_P(argv[0]))) {
6355 if (!rb_range_beg_len(argv[0], &beg, &len, RSTRING_LEN(str), 2)) {
6356 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6357 rb_builtin_class_name(argv[0]));
6358 }
6359 val = argv[1];
6360 StringValue(val);
6361 if (argc == 2) {
6362 /* bytesplice(range, str) */
6363 vbeg = 0;
6364 vlen = RSTRING_LEN(val);
6365 }
6366 else {
6367 /* bytesplice(range, str, str_range) */
6368 if (!rb_range_beg_len(argv[2], &vbeg, &vlen, RSTRING_LEN(val), 2)) {
6369 rb_raise(rb_eTypeError, "wrong argument type %s (expected Range)",
6370 rb_builtin_class_name(argv[2]));
6371 }
6372 }
6373 }
6374 else {
6375 beg = NUM2LONG(argv[0]);
6376 len = NUM2LONG(argv[1]);
6377 val = argv[2];
6378 StringValue(val);
6379 if (argc == 3) {
6380 /* bytesplice(index, length, str) */
6381 vbeg = 0;
6382 vlen = RSTRING_LEN(val);
6383 }
6384 else {
6385 /* bytesplice(index, length, str, str_index, str_length) */
6386 vbeg = NUM2LONG(argv[3]);
6387 vlen = NUM2LONG(argv[4]);
6388 }
6389 }
6390 str_check_beg_len(str, &beg, &len);
6391 str_check_beg_len(val, &vbeg, &vlen);
6392 enc = rb_enc_check(str, val);
6393 str_modify_keep_cr(str);
6394 rb_str_update_1(str, beg, len, val, vbeg, vlen);
6395 rb_enc_associate(str, enc);
6397 if (cr != ENC_CODERANGE_BROKEN)
6398 ENC_CODERANGE_SET(str, cr);
6399 return str;
6400}
6401
6402/*
6403 * call-seq:
6404 * reverse -> string
6405 *
6406 * Returns a new string with the characters from +self+ in reverse order.
6407 *
6408 * 'stressed'.reverse # => "desserts"
6409 *
6410 */
6411
6412static VALUE
6413rb_str_reverse(VALUE str)
6414{
6415 rb_encoding *enc;
6416 VALUE rev;
6417 char *s, *e, *p;
6418 int cr;
6419
6420 if (RSTRING_LEN(str) <= 1) return str_duplicate(rb_cString, str);
6421 enc = STR_ENC_GET(str);
6422 rev = rb_str_new(0, RSTRING_LEN(str));
6423 s = RSTRING_PTR(str); e = RSTRING_END(str);
6424 p = RSTRING_END(rev);
6425 cr = ENC_CODERANGE(str);
6426
6427 if (RSTRING_LEN(str) > 1) {
6428 if (single_byte_optimizable(str)) {
6429 while (s < e) {
6430 *--p = *s++;
6431 }
6432 }
6433 else if (cr == ENC_CODERANGE_VALID) {
6434 while (s < e) {
6435 int clen = rb_enc_fast_mbclen(s, e, enc);
6436
6437 p -= clen;
6438 memcpy(p, s, clen);
6439 s += clen;
6440 }
6441 }
6442 else {
6443 cr = rb_enc_asciicompat(enc) ?
6445 while (s < e) {
6446 int clen = rb_enc_mbclen(s, e, enc);
6447
6448 if (clen > 1 || (*s & 0x80)) cr = ENC_CODERANGE_UNKNOWN;
6449 p -= clen;
6450 memcpy(p, s, clen);
6451 s += clen;
6452 }
6453 }
6454 }
6455 STR_SET_LEN(rev, RSTRING_LEN(str));
6456 str_enc_copy_direct(rev, str);
6457 ENC_CODERANGE_SET(rev, cr);
6458
6459 return rev;
6460}
6461
6462
6463/*
6464 * call-seq:
6465 * reverse! -> self
6466 *
6467 * Returns +self+ with its characters reversed:
6468 *
6469 * s = 'stressed'
6470 * s.reverse! # => "desserts"
6471 * s # => "desserts"
6472 *
6473 */
6474
6475static VALUE
6476rb_str_reverse_bang(VALUE str)
6477{
6478 if (RSTRING_LEN(str) > 1) {
6479 if (single_byte_optimizable(str)) {
6480 char *s, *e, c;
6481
6482 str_modify_keep_cr(str);
6483 s = RSTRING_PTR(str);
6484 e = RSTRING_END(str) - 1;
6485 while (s < e) {
6486 c = *s;
6487 *s++ = *e;
6488 *e-- = c;
6489 }
6490 }
6491 else {
6492 str_shared_replace(str, rb_str_reverse(str));
6493 }
6494 }
6495 else {
6496 str_modify_keep_cr(str);
6497 }
6498 return str;
6499}
6500
6501
6502/*
6503 * call-seq:
6504 * include? other_string -> true or false
6505 *
6506 * Returns +true+ if +self+ contains +other_string+, +false+ otherwise:
6507 *
6508 * s = 'foo'
6509 * s.include?('f') # => true
6510 * s.include?('fo') # => true
6511 * s.include?('food') # => false
6512 *
6513 */
6514
6515VALUE
6516rb_str_include(VALUE str, VALUE arg)
6517{
6518 long i;
6519
6520 StringValue(arg);
6521 i = rb_str_index(str, arg, 0);
6522
6523 return RBOOL(i != -1);
6524}
6525
6526
6527/*
6528 * call-seq:
6529 * to_i(base = 10) -> integer
6530 *
6531 * Returns the result of interpreting leading characters in +self+
6532 * as an integer in the given +base+ (which must be in (0, 2..36)):
6533 *
6534 * '123456'.to_i # => 123456
6535 * '123def'.to_i(16) # => 1195503
6536 *
6537 * With +base+ zero, string +object+ may contain leading characters
6538 * to specify the actual base:
6539 *
6540 * '123def'.to_i(0) # => 123
6541 * '0123def'.to_i(0) # => 83
6542 * '0b123def'.to_i(0) # => 1
6543 * '0o123def'.to_i(0) # => 83
6544 * '0d123def'.to_i(0) # => 123
6545 * '0x123def'.to_i(0) # => 1195503
6546 *
6547 * Characters past a leading valid number (in the given +base+) are ignored:
6548 *
6549 * '12.345'.to_i # => 12
6550 * '12345'.to_i(2) # => 1
6551 *
6552 * Returns zero if there is no leading valid number:
6553 *
6554 * 'abcdef'.to_i # => 0
6555 * '2'.to_i(2) # => 0
6556 *
6557 */
6558
6559static VALUE
6560rb_str_to_i(int argc, VALUE *argv, VALUE str)
6561{
6562 int base = 10;
6563
6564 if (rb_check_arity(argc, 0, 1) && (base = NUM2INT(argv[0])) < 0) {
6565 rb_raise(rb_eArgError, "invalid radix %d", base);
6566 }
6567 return rb_str_to_inum(str, base, FALSE);
6568}
6569
6570
6571/*
6572 * call-seq:
6573 * to_f -> float
6574 *
6575 * Returns the result of interpreting leading characters in +self+ as a Float:
6576 *
6577 * '3.14159'.to_f # => 3.14159
6578 * '1.234e-2'.to_f # => 0.01234
6579 *
6580 * Characters past a leading valid number are ignored:
6581 *
6582 * '3.14 (pi to two places)'.to_f # => 3.14
6583 *
6584 * Returns zero if there is no leading valid number:
6585 *
6586 * 'abcdef'.to_f # => 0.0
6587 *
6588 */
6589
6590static VALUE
6591rb_str_to_f(VALUE str)
6592{
6593 return DBL2NUM(rb_str_to_dbl(str, FALSE));
6594}
6595
6596
6597/*
6598 * call-seq:
6599 * to_s -> self or string
6600 *
6601 * Returns +self+ if +self+ is a \String,
6602 * or +self+ converted to a \String if +self+ is a subclass of \String.
6603 */
6604
6605static VALUE
6606rb_str_to_s(VALUE str)
6607{
6608 if (rb_obj_class(str) != rb_cString) {
6609 return str_duplicate(rb_cString, str);
6610 }
6611 return str;
6612}
6613
#if 0
/* Compiled out. Encodes codepoint +c+ in +enc+ and appends it to +str+. */
static void
str_cat_char(VALUE str, unsigned int c, rb_encoding *enc)
{
    char encoded[RUBY_MAX_CHAR_LEN];
    const int nbytes = rb_enc_codelen(c, enc);

    rb_enc_mbcput(c, encoded, enc);
    rb_enc_str_buf_cat(str, encoded, nbytes, enc);
}
#endif
6625
6626#define CHAR_ESC_LEN 13 /* sizeof(\x{ hex of 32bit unsigned int } \0) */
6627
6628int
6629rb_str_buf_cat_escaped_char(VALUE result, unsigned int c, int unicode_p)
6630{
6631 char buf[CHAR_ESC_LEN + 1];
6632 int l;
6633
6634#if SIZEOF_INT > 4
6635 c &= 0xffffffff;
6636#endif
6637 if (unicode_p) {
6638 if (c < 0x7F && ISPRINT(c)) {
6639 snprintf(buf, CHAR_ESC_LEN, "%c", c);
6640 }
6641 else if (c < 0x10000) {
6642 snprintf(buf, CHAR_ESC_LEN, "\\u%04X", c);
6643 }
6644 else {
6645 snprintf(buf, CHAR_ESC_LEN, "\\u{%X}", c);
6646 }
6647 }
6648 else {
6649 if (c < 0x100) {
6650 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", c);
6651 }
6652 else {
6653 snprintf(buf, CHAR_ESC_LEN, "\\x{%X}", c);
6654 }
6655 }
6656 l = (int)strlen(buf); /* CHAR_ESC_LEN cannot exceed INT_MAX */
6657 rb_str_buf_cat(result, buf, l);
6658 return l;
6659}
6660
/*
 * Returns the backslash spelling for the control characters that have a
 * short named escape (NUL, newline, CR, tab, form feed, vertical tab,
 * backspace, bell, escape, DEL), or NULL for any other value of +c+.
 */
const char *
ruby_escaped_char(int c)
{
    static const struct {
        int code;
        const char *spelling;
    } named_escapes[] = {
        {'\0',   "\\0"},
        {'\n',   "\\n"},
        {'\r',   "\\r"},
        {'\t',   "\\t"},
        {'\f',   "\\f"},
        {'\013', "\\v"},
        {'\010', "\\b"},
        {'\007', "\\a"},
        {'\033', "\\e"},
        {'\x7f', "\\c?"},
    };
    size_t i;

    for (i = 0; i < sizeof(named_escapes) / sizeof(named_escapes[0]); i++) {
        if (named_escapes[i].code == c) return named_escapes[i].spelling;
    }
    return NULL;
}
6678
6679VALUE
6680rb_str_escape(VALUE str)
6681{
6682 int encidx = ENCODING_GET(str);
6683 rb_encoding *enc = rb_enc_from_index(encidx);
6684 const char *p = RSTRING_PTR(str);
6685 const char *pend = RSTRING_END(str);
6686 const char *prev = p;
6687 char buf[CHAR_ESC_LEN + 1];
6688 VALUE result = rb_str_buf_new(0);
6689 int unicode_p = rb_enc_unicode_p(enc);
6690 int asciicompat = rb_enc_asciicompat(enc);
6691
6692 while (p < pend) {
6693 unsigned int c;
6694 const char *cc;
6695 int n = rb_enc_precise_mbclen(p, pend, enc);
6696 if (!MBCLEN_CHARFOUND_P(n)) {
6697 if (p > prev) str_buf_cat(result, prev, p - prev);
6698 n = rb_enc_mbminlen(enc);
6699 if (pend < p + n)
6700 n = (int)(pend - p);
6701 while (n--) {
6702 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6703 str_buf_cat(result, buf, strlen(buf));
6704 prev = ++p;
6705 }
6706 continue;
6707 }
6708 n = MBCLEN_CHARFOUND_LEN(n);
6709 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6710 p += n;
6711 cc = ruby_escaped_char(c);
6712 if (cc) {
6713 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6714 str_buf_cat(result, cc, strlen(cc));
6715 prev = p;
6716 }
6717 else if (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c)) {
6718 }
6719 else {
6720 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6721 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6722 prev = p;
6723 }
6724 }
6725 if (p > prev) str_buf_cat(result, prev, p - prev);
6726 ENCODING_CODERANGE_SET(result, rb_usascii_encindex(), ENC_CODERANGE_7BIT);
6727
6728 return result;
6729}
6730
6731/*
6732 * call-seq:
6733 * inspect -> string
6734 *
6735 * Returns a printable version of +self+, enclosed in double-quotes,
6736 * and with special characters escaped:
6737 *
6738 * s = "foo\tbar\tbaz\n"
6739 * s.inspect
6740 * # => "\"foo\\tbar\\tbaz\\n\""
6741 *
6742 */
6743
6744VALUE
6746{
6747 int encidx = ENCODING_GET(str);
6748 rb_encoding *enc = rb_enc_from_index(encidx);
6749 const char *p, *pend, *prev;
6750 char buf[CHAR_ESC_LEN + 1];
6751 VALUE result = rb_str_buf_new(0);
6752 rb_encoding *resenc = rb_default_internal_encoding();
6753 int unicode_p = rb_enc_unicode_p(enc);
6754 int asciicompat = rb_enc_asciicompat(enc);
6755
6756 if (resenc == NULL) resenc = rb_default_external_encoding();
6757 if (!rb_enc_asciicompat(resenc)) resenc = rb_usascii_encoding();
6758 rb_enc_associate(result, resenc);
6759 str_buf_cat2(result, "\"");
6760
6761 p = RSTRING_PTR(str); pend = RSTRING_END(str);
6762 prev = p;
6763 while (p < pend) {
6764 unsigned int c, cc;
6765 int n;
6766
6767 n = rb_enc_precise_mbclen(p, pend, enc);
6768 if (!MBCLEN_CHARFOUND_P(n)) {
6769 if (p > prev) str_buf_cat(result, prev, p - prev);
6770 n = rb_enc_mbminlen(enc);
6771 if (pend < p + n)
6772 n = (int)(pend - p);
6773 while (n--) {
6774 snprintf(buf, CHAR_ESC_LEN, "\\x%02X", *p & 0377);
6775 str_buf_cat(result, buf, strlen(buf));
6776 prev = ++p;
6777 }
6778 continue;
6779 }
6780 n = MBCLEN_CHARFOUND_LEN(n);
6781 c = rb_enc_mbc_to_codepoint(p, pend, enc);
6782 p += n;
6783 if ((asciicompat || unicode_p) &&
6784 (c == '"'|| c == '\\' ||
6785 (c == '#' &&
6786 p < pend &&
6787 MBCLEN_CHARFOUND_P(rb_enc_precise_mbclen(p,pend,enc)) &&
6788 (cc = rb_enc_codepoint(p,pend,enc),
6789 (cc == '$' || cc == '@' || cc == '{'))))) {
6790 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6791 str_buf_cat2(result, "\\");
6792 if (asciicompat || enc == resenc) {
6793 prev = p - n;
6794 continue;
6795 }
6796 }
6797 switch (c) {
6798 case '\n': cc = 'n'; break;
6799 case '\r': cc = 'r'; break;
6800 case '\t': cc = 't'; break;
6801 case '\f': cc = 'f'; break;
6802 case '\013': cc = 'v'; break;
6803 case '\010': cc = 'b'; break;
6804 case '\007': cc = 'a'; break;
6805 case 033: cc = 'e'; break;
6806 default: cc = 0; break;
6807 }
6808 if (cc) {
6809 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6810 buf[0] = '\\';
6811 buf[1] = (char)cc;
6812 str_buf_cat(result, buf, 2);
6813 prev = p;
6814 continue;
6815 }
6816 /* The special casing of 0x85 (NEXT_LINE) here is because
6817 * Oniguruma historically treats it as printable, but it
6818 * doesn't match the print POSIX bracket class or character
6819 * property in regexps.
6820 *
6821 * See Ruby Bug #16842 for details:
6822 * https://bugs.ruby-lang.org/issues/16842
6823 */
6824 if ((enc == resenc && rb_enc_isprint(c, enc) && c != 0x85) ||
6825 (asciicompat && rb_enc_isascii(c, enc) && ISPRINT(c))) {
6826 continue;
6827 }
6828 else {
6829 if (p - n > prev) str_buf_cat(result, prev, p - n - prev);
6830 rb_str_buf_cat_escaped_char(result, c, unicode_p);
6831 prev = p;
6832 continue;
6833 }
6834 }
6835 if (p > prev) str_buf_cat(result, prev, p - prev);
6836 str_buf_cat2(result, "\"");
6837
6838 return result;
6839}
6840
6841#define IS_EVSTR(p,e) ((p) < (e) && (*(p) == '$' || *(p) == '@' || *(p) == '{'))
6842
6843/*
6844 * call-seq:
6845 * dump -> string
6846 *
6847 * Returns a printable version of +self+, enclosed in double-quotes,
6848 * with special characters escaped, and with non-printing characters
6849 * replaced by hexadecimal notation:
6850 *
6851 * "hello \n ''".dump # => "\"hello \\n ''\""
6852 * "\f\x00\xff\\\"".dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
6853 *
6854 * Related: String#undump (inverse of String#dump).
6855 *
6856 */
6857
6858VALUE
6860{
6861 int encidx = rb_enc_get_index(str);
6862 rb_encoding *enc = rb_enc_from_index(encidx);
6863 long len;
6864 const char *p, *pend;
6865 char *q, *qend;
6866 VALUE result;
6867 int u8 = (encidx == rb_utf8_encindex());
6868 static const char nonascii_suffix[] = ".dup.force_encoding(\"%s\")";
6869
6870 len = 2; /* "" */
6871 if (!rb_enc_asciicompat(enc)) {
6872 len += strlen(nonascii_suffix) - rb_strlen_lit("%s");
6873 len += strlen(enc->name);
6874 }
6875
6876 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6877 while (p < pend) {
6878 int clen;
6879 unsigned char c = *p++;
6880
6881 switch (c) {
6882 case '"': case '\\':
6883 case '\n': case '\r':
6884 case '\t': case '\f':
6885 case '\013': case '\010': case '\007': case '\033':
6886 clen = 2;
6887 break;
6888
6889 case '#':
6890 clen = IS_EVSTR(p, pend) ? 2 : 1;
6891 break;
6892
6893 default:
6894 if (ISPRINT(c)) {
6895 clen = 1;
6896 }
6897 else {
6898 if (u8 && c > 0x7F) { /* \u notation */
6899 int n = rb_enc_precise_mbclen(p-1, pend, enc);
6900 if (MBCLEN_CHARFOUND_P(n)) {
6901 unsigned int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6902 if (cc <= 0xFFFF)
6903 clen = 6; /* \uXXXX */
6904 else if (cc <= 0xFFFFF)
6905 clen = 9; /* \u{XXXXX} */
6906 else
6907 clen = 10; /* \u{XXXXXX} */
6908 p += MBCLEN_CHARFOUND_LEN(n)-1;
6909 break;
6910 }
6911 }
6912 clen = 4; /* \xNN */
6913 }
6914 break;
6915 }
6916
6917 if (clen > LONG_MAX - len) {
6918 rb_raise(rb_eRuntimeError, "string size too big");
6919 }
6920 len += clen;
6921 }
6922
6923 result = rb_str_new(0, len);
6924 p = RSTRING_PTR(str); pend = p + RSTRING_LEN(str);
6925 q = RSTRING_PTR(result); qend = q + len + 1;
6926
6927 *q++ = '"';
6928 while (p < pend) {
6929 unsigned char c = *p++;
6930
6931 if (c == '"' || c == '\\') {
6932 *q++ = '\\';
6933 *q++ = c;
6934 }
6935 else if (c == '#') {
6936 if (IS_EVSTR(p, pend)) *q++ = '\\';
6937 *q++ = '#';
6938 }
6939 else if (c == '\n') {
6940 *q++ = '\\';
6941 *q++ = 'n';
6942 }
6943 else if (c == '\r') {
6944 *q++ = '\\';
6945 *q++ = 'r';
6946 }
6947 else if (c == '\t') {
6948 *q++ = '\\';
6949 *q++ = 't';
6950 }
6951 else if (c == '\f') {
6952 *q++ = '\\';
6953 *q++ = 'f';
6954 }
6955 else if (c == '\013') {
6956 *q++ = '\\';
6957 *q++ = 'v';
6958 }
6959 else if (c == '\010') {
6960 *q++ = '\\';
6961 *q++ = 'b';
6962 }
6963 else if (c == '\007') {
6964 *q++ = '\\';
6965 *q++ = 'a';
6966 }
6967 else if (c == '\033') {
6968 *q++ = '\\';
6969 *q++ = 'e';
6970 }
6971 else if (ISPRINT(c)) {
6972 *q++ = c;
6973 }
6974 else {
6975 *q++ = '\\';
6976 if (u8) {
6977 int n = rb_enc_precise_mbclen(p-1, pend, enc) - 1;
6978 if (MBCLEN_CHARFOUND_P(n)) {
6979 int cc = rb_enc_mbc_to_codepoint(p-1, pend, enc);
6980 p += n;
6981 if (cc <= 0xFFFF)
6982 snprintf(q, qend-q, "u%04X", cc); /* \uXXXX */
6983 else
6984 snprintf(q, qend-q, "u{%X}", cc); /* \u{XXXXX} or \u{XXXXXX} */
6985 q += strlen(q);
6986 continue;
6987 }
6988 }
6989 snprintf(q, qend-q, "x%02X", c);
6990 q += 3;
6991 }
6992 }
6993 *q++ = '"';
6994 *q = '\0';
6995 if (!rb_enc_asciicompat(enc)) {
6996 snprintf(q, qend-q, nonascii_suffix, enc->name);
6997 encidx = rb_ascii8bit_encindex();
6998 }
6999 /* result from dump is ASCII */
7000 rb_enc_associate_index(result, encidx);
7002 return result;
7003}
7004
/*
 * Maps the letter that follows a backslash in a simple escape (n, r, t, f,
 * v, b, a, e) to the control character it denotes.
 *
 * Returns -1 for any other value, so that control never falls off the end
 * of this non-void function.  The caller visible in this file only passes
 * one of the listed letters.
 */
static int
unescape_ascii(unsigned int c)
{
    switch (c) {
      case 'n':
        return '\n';
      case 'r':
        return '\r';
      case 't':
        return '\t';
      case 'f':
        return '\f';
      case 'v':
        return '\13';
      case 'b':
        return '\010';
      case 'a':
        return '\007';
      case 'e':
        return 033;
    }
    return -1;
}
7028
7029static void
7030undump_after_backslash(VALUE undumped, const char **ss, const char *s_end, rb_encoding **penc, bool *utf8, bool *binary)
7031{
7032 const char *s = *ss;
7033 unsigned int c;
7034 int codelen;
7035 size_t hexlen;
7036 unsigned char buf[6];
7037 static rb_encoding *enc_utf8 = NULL;
7038
7039 switch (*s) {
7040 case '\\':
7041 case '"':
7042 case '#':
7043 rb_str_cat(undumped, s, 1); /* cat itself */
7044 s++;
7045 break;
7046 case 'n':
7047 case 'r':
7048 case 't':
7049 case 'f':
7050 case 'v':
7051 case 'b':
7052 case 'a':
7053 case 'e':
7054 *buf = unescape_ascii(*s);
7055 rb_str_cat(undumped, (char *)buf, 1);
7056 s++;
7057 break;
7058 case 'u':
7059 if (*binary) {
7060 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7061 }
7062 *utf8 = true;
7063 if (++s >= s_end) {
7064 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7065 }
7066 if (enc_utf8 == NULL) enc_utf8 = rb_utf8_encoding();
7067 if (*penc != enc_utf8) {
7068 *penc = enc_utf8;
7069 rb_enc_associate(undumped, enc_utf8);
7070 }
7071 if (*s == '{') { /* handle \u{...} form */
7072 s++;
7073 for (;;) {
7074 if (s >= s_end) {
7075 rb_raise(rb_eRuntimeError, "unterminated Unicode escape");
7076 }
7077 if (*s == '}') {
7078 s++;
7079 break;
7080 }
7081 if (ISSPACE(*s)) {
7082 s++;
7083 continue;
7084 }
7085 c = scan_hex(s, s_end-s, &hexlen);
7086 if (hexlen == 0 || hexlen > 6) {
7087 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7088 }
7089 if (c > 0x10ffff) {
7090 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint (too large)");
7091 }
7092 if (0xd800 <= c && c <= 0xdfff) {
7093 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7094 }
7095 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7096 rb_str_cat(undumped, (char *)buf, codelen);
7097 s += hexlen;
7098 }
7099 }
7100 else { /* handle \uXXXX form */
7101 c = scan_hex(s, 4, &hexlen);
7102 if (hexlen != 4) {
7103 rb_raise(rb_eRuntimeError, "invalid Unicode escape");
7104 }
7105 if (0xd800 <= c && c <= 0xdfff) {
7106 rb_raise(rb_eRuntimeError, "invalid Unicode codepoint");
7107 }
7108 codelen = rb_enc_mbcput(c, (char *)buf, *penc);
7109 rb_str_cat(undumped, (char *)buf, codelen);
7110 s += hexlen;
7111 }
7112 break;
7113 case 'x':
7114 if (*utf8) {
7115 rb_raise(rb_eRuntimeError, "hex escape and Unicode escape are mixed");
7116 }
7117 *binary = true;
7118 if (++s >= s_end) {
7119 rb_raise(rb_eRuntimeError, "invalid hex escape");
7120 }
7121 *buf = scan_hex(s, 2, &hexlen);
7122 if (hexlen != 2) {
7123 rb_raise(rb_eRuntimeError, "invalid hex escape");
7124 }
7125 rb_str_cat(undumped, (char *)buf, 1);
7126 s += hexlen;
7127 break;
7128 default:
7129 rb_str_cat(undumped, s-1, 2);
7130 s++;
7131 }
7132
7133 *ss = s;
7134}
7135
7136static VALUE rb_str_is_ascii_only_p(VALUE str);
7137
7138/*
7139 * call-seq:
7140 * undump -> string
7141 *
7142 * Returns an unescaped version of +self+:
7143 *
7144 * s_orig = "\f\x00\xff\\\"" # => "\f\u0000\xFF\\\""
7145 * s_dumped = s_orig.dump # => "\"\\f\\x00\\xFF\\\\\\\"\""
7146 * s_undumped = s_dumped.undump # => "\f\u0000\xFF\\\""
7147 * s_undumped == s_orig # => true
7148 *
7149 * Related: String#dump (inverse of String#undump).
7150 *
7151 */
7152
7153static VALUE
7154str_undump(VALUE str)
7155{
7156 const char *s = RSTRING_PTR(str);
7157 const char *s_end = RSTRING_END(str);
7158 rb_encoding *enc = rb_enc_get(str);
7159 VALUE undumped = rb_enc_str_new(s, 0L, enc);
7160 bool utf8 = false;
7161 bool binary = false;
7162 int w;
7163
7165 if (rb_str_is_ascii_only_p(str) == Qfalse) {
7166 rb_raise(rb_eRuntimeError, "non-ASCII character detected");
7167 }
7168 if (!str_null_check(str, &w)) {
7169 rb_raise(rb_eRuntimeError, "string contains null byte");
7170 }
7171 if (RSTRING_LEN(str) < 2) goto invalid_format;
7172 if (*s != '"') goto invalid_format;
7173
7174 /* strip '"' at the start */
7175 s++;
7176
7177 for (;;) {
7178 if (s >= s_end) {
7179 rb_raise(rb_eRuntimeError, "unterminated dumped string");
7180 }
7181
7182 if (*s == '"') {
7183 /* epilogue */
7184 s++;
7185 if (s == s_end) {
7186 /* ascii compatible dumped string */
7187 break;
7188 }
7189 else {
7190 static const char force_encoding_suffix[] = ".force_encoding(\""; /* "\")" */
7191 static const char dup_suffix[] = ".dup";
7192 const char *encname;
7193 int encidx;
7194 ptrdiff_t size;
7195
7196 /* check separately for strings dumped by older versions */
7197 size = sizeof(dup_suffix) - 1;
7198 if (s_end - s > size && memcmp(s, dup_suffix, size) == 0) s += size;
7199
7200 size = sizeof(force_encoding_suffix) - 1;
7201 if (s_end - s <= size) goto invalid_format;
7202 if (memcmp(s, force_encoding_suffix, size) != 0) goto invalid_format;
7203 s += size;
7204
7205 if (utf8) {
7206 rb_raise(rb_eRuntimeError, "dumped string contained Unicode escape but used force_encoding");
7207 }
7208
7209 encname = s;
7210 s = memchr(s, '"', s_end-s);
7211 size = s - encname;
7212 if (!s) goto invalid_format;
7213 if (s_end - s != 2) goto invalid_format;
7214 if (s[0] != '"' || s[1] != ')') goto invalid_format;
7215
7216 encidx = rb_enc_find_index2(encname, (long)size);
7217 if (encidx < 0) {
7218 rb_raise(rb_eRuntimeError, "dumped string has unknown encoding name");
7219 }
7220 rb_enc_associate_index(undumped, encidx);
7221 }
7222 break;
7223 }
7224
7225 if (*s == '\\') {
7226 s++;
7227 if (s >= s_end) {
7228 rb_raise(rb_eRuntimeError, "invalid escape");
7229 }
7230 undump_after_backslash(undumped, &s, s_end, &enc, &utf8, &binary);
7231 }
7232 else {
7233 rb_str_cat(undumped, s++, 1);
7234 }
7235 }
7236
7237 RB_GC_GUARD(str);
7238
7239 return undumped;
7240invalid_format:
7241 rb_raise(rb_eRuntimeError, "invalid dumped string; not wrapped with '\"' nor '\"...\".force_encoding(\"...\")' form");
7242}
7243
7244static void
7245rb_str_check_dummy_enc(rb_encoding *enc)
7246{
7247 if (rb_enc_dummy_p(enc)) {
7248 rb_raise(rb_eEncCompatError, "incompatible encoding with this operation: %s",
7249 rb_enc_name(enc));
7250 }
7251}
7252
7253static rb_encoding *
7254str_true_enc(VALUE str)
7255{
7256 rb_encoding *enc = STR_ENC_GET(str);
7257 rb_str_check_dummy_enc(enc);
7258 return enc;
7259}
7260
7261static OnigCaseFoldType
7262check_case_options(int argc, VALUE *argv, OnigCaseFoldType flags)
7263{
7264 if (argc==0)
7265 return flags;
7266 if (argc>2)
7267 rb_raise(rb_eArgError, "too many options");
7268 if (argv[0]==sym_turkic) {
7269 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7270 if (argc==2) {
7271 if (argv[1]==sym_lithuanian)
7272 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7273 else
7274 rb_raise(rb_eArgError, "invalid second option");
7275 }
7276 }
7277 else if (argv[0]==sym_lithuanian) {
7278 flags |= ONIGENC_CASE_FOLD_LITHUANIAN;
7279 if (argc==2) {
7280 if (argv[1]==sym_turkic)
7281 flags |= ONIGENC_CASE_FOLD_TURKISH_AZERI;
7282 else
7283 rb_raise(rb_eArgError, "invalid second option");
7284 }
7285 }
7286 else if (argc>1)
7287 rb_raise(rb_eArgError, "too many options");
7288 else if (argv[0]==sym_ascii)
7289 flags |= ONIGENC_CASE_ASCII_ONLY;
7290 else if (argv[0]==sym_fold) {
7291 if ((flags & (ONIGENC_CASE_UPCASE|ONIGENC_CASE_DOWNCASE)) == ONIGENC_CASE_DOWNCASE)
7292 flags ^= ONIGENC_CASE_FOLD|ONIGENC_CASE_DOWNCASE;
7293 else
7294 rb_raise(rb_eArgError, "option :fold only allowed for downcasing");
7295 }
7296 else
7297 rb_raise(rb_eArgError, "invalid option");
7298 return flags;
7299}
7300
7301static inline bool
7302case_option_single_p(OnigCaseFoldType flags, rb_encoding *enc, VALUE str)
7303{
7304 if ((flags & ONIGENC_CASE_ASCII_ONLY) && (enc==rb_utf8_encoding() || rb_enc_mbmaxlen(enc) == 1))
7305 return true;
7306 return !(flags & ONIGENC_CASE_FOLD_TURKISH_AZERI) && ENC_CODERANGE(str) == ENC_CODERANGE_7BIT;
7307}
7308
7309/* 16 should be long enough to absorb any kind of single character length increase; the value defined below (20) exceeds that */
7310#define CASE_MAPPING_ADDITIONAL_LENGTH 20
7311#ifndef CASEMAP_DEBUG
7312# define CASEMAP_DEBUG 0
7313#endif
7314
7315struct mapping_buffer;
7316typedef struct mapping_buffer {
7317 size_t capa;
7318 size_t used;
7319 struct mapping_buffer *next;
7320 OnigUChar space[FLEX_ARY_LEN];
7322
7323static void
7324mapping_buffer_free(void *p)
7325{
7326 mapping_buffer *previous_buffer;
7327 mapping_buffer *current_buffer = p;
7328 while (current_buffer) {
7329 previous_buffer = current_buffer;
7330 current_buffer = current_buffer->next;
7331 ruby_sized_xfree(previous_buffer, previous_buffer->capa);
7332 }
7333}
7334
7335static const rb_data_type_t mapping_buffer_type = {
7336 "mapping_buffer",
7337 {0, mapping_buffer_free,},
7338 0, 0, RUBY_TYPED_FREE_IMMEDIATELY | RUBY_TYPED_WB_PROTECTED
7339};
7340
7341static VALUE
7342rb_str_casemap(VALUE source, OnigCaseFoldType *flags, rb_encoding *enc)
7343{
7344 VALUE target;
7345
7346 const OnigUChar *source_current, *source_end;
7347 int target_length = 0;
7348 VALUE buffer_anchor;
7349 mapping_buffer *current_buffer = 0;
7350 mapping_buffer **pre_buffer;
7351 size_t buffer_count = 0;
7352 int buffer_length_or_invalid;
7353
7354 if (RSTRING_LEN(source) == 0) return str_duplicate(rb_cString, source);
7355
7356 source_current = (OnigUChar*)RSTRING_PTR(source);
7357 source_end = (OnigUChar*)RSTRING_END(source);
7358
7359 buffer_anchor = TypedData_Wrap_Struct(0, &mapping_buffer_type, 0);
7360 pre_buffer = (mapping_buffer **)&DATA_PTR(buffer_anchor);
7361 while (source_current < source_end) {
7362 /* increase multiplier using buffer count to converge quickly */
7363 size_t capa = (size_t)(source_end-source_current)*++buffer_count + CASE_MAPPING_ADDITIONAL_LENGTH;
7364 if (CASEMAP_DEBUG) {
7365 fprintf(stderr, "Buffer allocation, capa is %"PRIuSIZE"\n", capa); /* for tuning */
7366 }
7367 current_buffer = xmalloc(offsetof(mapping_buffer, space) + capa);
7368 *pre_buffer = current_buffer;
7369 pre_buffer = &current_buffer->next;
7370 current_buffer->next = NULL;
7371 current_buffer->capa = capa;
7372 buffer_length_or_invalid = enc->case_map(flags,
7373 &source_current, source_end,
7374 current_buffer->space,
7375 current_buffer->space+current_buffer->capa,
7376 enc);
7377 if (buffer_length_or_invalid < 0) {
7378 current_buffer = DATA_PTR(buffer_anchor);
7379 DATA_PTR(buffer_anchor) = 0;
7380 mapping_buffer_free(current_buffer);
7381 rb_raise(rb_eArgError, "input string invalid");
7382 }
7383 target_length += current_buffer->used = buffer_length_or_invalid;
7384 }
7385 if (CASEMAP_DEBUG) {
7386 fprintf(stderr, "Buffer count is %"PRIuSIZE"\n", buffer_count); /* for tuning */
7387 }
7388
7389 if (buffer_count==1) {
7390 target = rb_str_new((const char*)current_buffer->space, target_length);
7391 }
7392 else {
7393 char *target_current;
7394
7395 target = rb_str_new(0, target_length);
7396 target_current = RSTRING_PTR(target);
7397 current_buffer = DATA_PTR(buffer_anchor);
7398 while (current_buffer) {
7399 memcpy(target_current, current_buffer->space, current_buffer->used);
7400 target_current += current_buffer->used;
7401 current_buffer = current_buffer->next;
7402 }
7403 }
7404 current_buffer = DATA_PTR(buffer_anchor);
7405 DATA_PTR(buffer_anchor) = 0;
7406 mapping_buffer_free(current_buffer);
7407
7408 RB_GC_GUARD(buffer_anchor);
7409
7410 /* TODO: check about string terminator character */
7411 str_enc_copy_direct(target, source);
7412 /*ENC_CODERANGE_SET(mapped, cr);*/
7413
7414 return target;
7415}
7416
7417static VALUE
7418rb_str_ascii_casemap(VALUE source, VALUE target, OnigCaseFoldType *flags, rb_encoding *enc)
7419{
7420 const OnigUChar *source_current, *source_end;
7421 OnigUChar *target_current, *target_end;
7422 long old_length = RSTRING_LEN(source);
7423 int length_or_invalid;
7424
7425 if (old_length == 0) return Qnil;
7426
7427 source_current = (OnigUChar*)RSTRING_PTR(source);
7428 source_end = (OnigUChar*)RSTRING_END(source);
7429 if (source == target) {
7430 target_current = (OnigUChar*)source_current;
7431 target_end = (OnigUChar*)source_end;
7432 }
7433 else {
7434 target_current = (OnigUChar*)RSTRING_PTR(target);
7435 target_end = (OnigUChar*)RSTRING_END(target);
7436 }
7437
7438 length_or_invalid = onigenc_ascii_only_case_map(flags,
7439 &source_current, source_end,
7440 target_current, target_end, enc);
7441 if (length_or_invalid < 0)
7442 rb_raise(rb_eArgError, "input string invalid");
7443 if (CASEMAP_DEBUG && length_or_invalid != old_length) {
7444 fprintf(stderr, "problem with rb_str_ascii_casemap"
7445 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7446 rb_raise(rb_eArgError, "internal problem with rb_str_ascii_casemap"
7447 "; old_length=%ld, new_length=%d\n", old_length, length_or_invalid);
7448 }
7449
7450 str_enc_copy(target, source);
7451
7452 return target;
7453}
7454
7455static bool
7456upcase_single(VALUE str)
7457{
7458 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7459 bool modified = false;
7460
7461 while (s < send) {
7462 unsigned int c = *(unsigned char*)s;
7463
7464 if ('a' <= c && c <= 'z') {
7465 *s = 'A' + (c - 'a');
7466 modified = true;
7467 }
7468 s++;
7469 }
7470 return modified;
7471}
7472
7473/*
7474 * call-seq:
7475 * upcase!(*options) -> self or nil
7476 *
7477 * Upcases the characters in +self+;
7478 * returns +self+ if any changes were made, +nil+ otherwise:
7479 *
7480 * s = 'Hello World!' # => "Hello World!"
7481 * s.upcase! # => "HELLO WORLD!"
7482 * s # => "HELLO WORLD!"
7483 * s.upcase! # => nil
7484 *
7485 * The casing may be affected by the given +options+;
7486 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7487 *
7488 * Related: String#upcase, String#downcase, String#downcase!.
7489 *
7490 */
7491
7492static VALUE
7493rb_str_upcase_bang(int argc, VALUE *argv, VALUE str)
7494{
7495 rb_encoding *enc;
7496 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7497
7498 flags = check_case_options(argc, argv, flags);
7499 str_modify_keep_cr(str);
7500 enc = str_true_enc(str);
7501 if (case_option_single_p(flags, enc, str)) {
7502 if (upcase_single(str))
7503 flags |= ONIGENC_CASE_MODIFIED;
7504 }
7505 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7506 rb_str_ascii_casemap(str, str, &flags, enc);
7507 else
7508 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7509
7510 if (ONIGENC_CASE_MODIFIED&flags) return str;
7511 return Qnil;
7512}
7513
7514
7515/*
7516 * call-seq:
7517 * upcase(*options) -> string
7518 *
7519 * Returns a string containing the upcased characters in +self+:
7520 *
7521 * s = 'Hello World!' # => "Hello World!"
7522 * s.upcase # => "HELLO WORLD!"
7523 *
7524 * The casing may be affected by the given +options+;
7525 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7526 *
7527 * Related: String#upcase!, String#downcase, String#downcase!.
7528 *
7529 */
7530
7531static VALUE
7532rb_str_upcase(int argc, VALUE *argv, VALUE str)
7533{
7534 rb_encoding *enc;
7535 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE;
7536 VALUE ret;
7537
7538 flags = check_case_options(argc, argv, flags);
7539 enc = str_true_enc(str);
7540 if (case_option_single_p(flags, enc, str)) {
7541 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7542 str_enc_copy_direct(ret, str);
7543 upcase_single(ret);
7544 }
7545 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7546 ret = rb_str_new(0, RSTRING_LEN(str));
7547 rb_str_ascii_casemap(str, ret, &flags, enc);
7548 }
7549 else {
7550 ret = rb_str_casemap(str, &flags, enc);
7551 }
7552
7553 return ret;
7554}
7555
7556static bool
7557downcase_single(VALUE str)
7558{
7559 char *s = RSTRING_PTR(str), *send = RSTRING_END(str);
7560 bool modified = false;
7561
7562 while (s < send) {
7563 unsigned int c = *(unsigned char*)s;
7564
7565 if ('A' <= c && c <= 'Z') {
7566 *s = 'a' + (c - 'A');
7567 modified = true;
7568 }
7569 s++;
7570 }
7571
7572 return modified;
7573}
7574
7575/*
7576 * call-seq:
7577 * downcase!(*options) -> self or nil
7578 *
7579 * Downcases the characters in +self+;
7580 * returns +self+ if any changes were made, +nil+ otherwise:
7581 *
7582 * s = 'Hello World!' # => "Hello World!"
7583 * s.downcase! # => "hello world!"
7584 * s # => "hello world!"
7585 * s.downcase! # => nil
7586 *
7587 * The casing may be affected by the given +options+;
7588 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7589 *
7590 * Related: String#downcase, String#upcase, String#upcase!.
7591 *
7592 */
7593
7594static VALUE
7595rb_str_downcase_bang(int argc, VALUE *argv, VALUE str)
7596{
7597 rb_encoding *enc;
7598 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7599
7600 flags = check_case_options(argc, argv, flags);
7601 str_modify_keep_cr(str);
7602 enc = str_true_enc(str);
7603 if (case_option_single_p(flags, enc, str)) {
7604 if (downcase_single(str))
7605 flags |= ONIGENC_CASE_MODIFIED;
7606 }
7607 else if (flags&ONIGENC_CASE_ASCII_ONLY)
7608 rb_str_ascii_casemap(str, str, &flags, enc);
7609 else
7610 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7611
7612 if (ONIGENC_CASE_MODIFIED&flags) return str;
7613 return Qnil;
7614}
7615
7616
7617/*
7618 * call-seq:
7619 * downcase(*options) -> string
7620 *
7621 * Returns a string containing the downcased characters in +self+:
7622 *
7623 * s = 'Hello World!' # => "Hello World!"
7624 * s.downcase # => "hello world!"
7625 *
7626 * The casing may be affected by the given +options+;
7627 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7628 *
7629 * Related: String#downcase!, String#upcase, String#upcase!.
7630 *
7631 */
7632
7633static VALUE
7634rb_str_downcase(int argc, VALUE *argv, VALUE str)
7635{
7636 rb_encoding *enc;
7637 OnigCaseFoldType flags = ONIGENC_CASE_DOWNCASE;
7638 VALUE ret;
7639
7640 flags = check_case_options(argc, argv, flags);
7641 enc = str_true_enc(str);
7642 if (case_option_single_p(flags, enc, str)) {
7643 ret = rb_str_new(RSTRING_PTR(str), RSTRING_LEN(str));
7644 str_enc_copy_direct(ret, str);
7645 downcase_single(ret);
7646 }
7647 else if (flags&ONIGENC_CASE_ASCII_ONLY) {
7648 ret = rb_str_new(0, RSTRING_LEN(str));
7649 rb_str_ascii_casemap(str, ret, &flags, enc);
7650 }
7651 else {
7652 ret = rb_str_casemap(str, &flags, enc);
7653 }
7654
7655 return ret;
7656}
7657
7658
7659/*
7660 * call-seq:
7661 * capitalize!(*options) -> self or nil
7662 *
7663 * Upcases the first character in +self+;
7664 * downcases the remaining characters;
7665 * returns +self+ if any changes were made, +nil+ otherwise:
7666 *
7667 * s = 'hello World!' # => "hello World!"
7668 * s.capitalize! # => "Hello world!"
7669 * s # => "Hello world!"
7670 * s.capitalize! # => nil
7671 *
7672 * The casing may be affected by the given +options+;
7673 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7674 *
7675 * Related: String#capitalize.
7676 *
7677 */
7678
7679static VALUE
7680rb_str_capitalize_bang(int argc, VALUE *argv, VALUE str)
7681{
7682 rb_encoding *enc;
7683 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7684
7685 flags = check_case_options(argc, argv, flags);
7686 str_modify_keep_cr(str);
7687 enc = str_true_enc(str);
7688 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7689 if (flags&ONIGENC_CASE_ASCII_ONLY)
7690 rb_str_ascii_casemap(str, str, &flags, enc);
7691 else
7692 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7693
7694 if (ONIGENC_CASE_MODIFIED&flags) return str;
7695 return Qnil;
7696}
7697
7698
7699/*
7700 * call-seq:
7701 * capitalize(*options) -> string
7702 *
7703 * Returns a string containing the characters in +self+;
7704 * the first character is upcased;
7705 * the remaining characters are downcased:
7706 *
7707 * s = 'hello World!' # => "hello World!"
7708 * s.capitalize # => "Hello world!"
7709 *
7710 * The casing may be affected by the given +options+;
7711 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7712 *
7713 * Related: String#capitalize!.
7714 *
7715 */
7716
7717static VALUE
7718rb_str_capitalize(int argc, VALUE *argv, VALUE str)
7719{
7720 rb_encoding *enc;
7721 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_TITLECASE;
7722 VALUE ret;
7723
7724 flags = check_case_options(argc, argv, flags);
7725 enc = str_true_enc(str);
7726 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str;
7727 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7728 ret = rb_str_new(0, RSTRING_LEN(str));
7729 rb_str_ascii_casemap(str, ret, &flags, enc);
7730 }
7731 else {
7732 ret = rb_str_casemap(str, &flags, enc);
7733 }
7734 return ret;
7735}
7736
7737
7738/*
7739 * call-seq:
7740 * swapcase!(*options) -> self or nil
7741 *
7742 * Upcases each lowercase character in +self+;
 *  downcases each uppercase character;
7744 * returns +self+ if any changes were made, +nil+ otherwise:
7745 *
7746 * s = 'Hello World!' # => "Hello World!"
7747 * s.swapcase! # => "hELLO wORLD!"
7748 * s # => "hELLO wORLD!"
7749 * ''.swapcase! # => nil
7750 *
7751 * The casing may be affected by the given +options+;
7752 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7753 *
7754 * Related: String#swapcase.
7755 *
7756 */
7757
7758static VALUE
7759rb_str_swapcase_bang(int argc, VALUE *argv, VALUE str)
7760{
7761 rb_encoding *enc;
7762 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7763
7764 flags = check_case_options(argc, argv, flags);
7765 str_modify_keep_cr(str);
7766 enc = str_true_enc(str);
7767 if (flags&ONIGENC_CASE_ASCII_ONLY)
7768 rb_str_ascii_casemap(str, str, &flags, enc);
7769 else
7770 str_shared_replace(str, rb_str_casemap(str, &flags, enc));
7771
7772 if (ONIGENC_CASE_MODIFIED&flags) return str;
7773 return Qnil;
7774}
7775
7776
7777/*
7778 * call-seq:
7779 * swapcase(*options) -> string
7780 *
7781 * Returns a string containing the characters in +self+, with cases reversed;
7782 * each uppercase character is downcased;
7783 * each lowercase character is upcased:
7784 *
7785 * s = 'Hello World!' # => "Hello World!"
7786 * s.swapcase # => "hELLO wORLD!"
7787 *
7788 * The casing may be affected by the given +options+;
7789 * see {Case Mapping}[rdoc-ref:case_mapping.rdoc].
7790 *
7791 * Related: String#swapcase!.
7792 *
7793 */
7794
7795static VALUE
7796rb_str_swapcase(int argc, VALUE *argv, VALUE str)
7797{
7798 rb_encoding *enc;
7799 OnigCaseFoldType flags = ONIGENC_CASE_UPCASE | ONIGENC_CASE_DOWNCASE;
7800 VALUE ret;
7801
7802 flags = check_case_options(argc, argv, flags);
7803 enc = str_true_enc(str);
7804 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return str_duplicate(rb_cString, str);
7805 if (flags&ONIGENC_CASE_ASCII_ONLY) {
7806 ret = rb_str_new(0, RSTRING_LEN(str));
7807 rb_str_ascii_casemap(str, ret, &flags, enc);
7808 }
7809 else {
7810 ret = rb_str_casemap(str, &flags, enc);
7811 }
7812 return ret;
7813}
7814
typedef unsigned char *USTR;

/* Cursor over a character selector string, advanced by trnext(). */
struct tr {
    int gen;            /* non-zero while a range ("a-z") is being expanded */
    unsigned int now;   /* code point most recently produced */
    unsigned int max;   /* last code point of the range being expanded */
    char *p;            /* next unread byte of the selector */
    char *pend;         /* end of the selector */
};
7822
7823static unsigned int
7824trnext(struct tr *t, rb_encoding *enc)
7825{
7826 int n;
7827
7828 for (;;) {
7829 nextpart:
7830 if (!t->gen) {
7831 if (t->p == t->pend) return -1;
7832 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '\\' && t->p + n < t->pend) {
7833 t->p += n;
7834 }
7835 t->now = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7836 t->p += n;
7837 if (rb_enc_ascget(t->p, t->pend, &n, enc) == '-' && t->p + n < t->pend) {
7838 t->p += n;
7839 if (t->p < t->pend) {
7840 unsigned int c = rb_enc_codepoint_len(t->p, t->pend, &n, enc);
7841 t->p += n;
7842 if (t->now > c) {
7843 if (t->now < 0x80 && c < 0x80) {
7844 rb_raise(rb_eArgError,
7845 "invalid range \"%c-%c\" in string transliteration",
7846 t->now, c);
7847 }
7848 else {
7849 rb_raise(rb_eArgError, "invalid range in string transliteration");
7850 }
7851 continue; /* not reached */
7852 }
7853 else if (t->now < c) {
7854 t->gen = 1;
7855 t->max = c;
7856 }
7857 }
7858 }
7859 return t->now;
7860 }
7861 else {
7862 while (ONIGENC_CODE_TO_MBCLEN(enc, ++t->now) <= 0) {
7863 if (t->now == t->max) {
7864 t->gen = 0;
7865 goto nextpart;
7866 }
7867 }
7868 if (t->now < t->max) {
7869 return t->now;
7870 }
7871 else {
7872 t->gen = 0;
7873 return t->max;
7874 }
7875 }
7876 }
7877}
7878
7879static VALUE rb_str_delete_bang(int,VALUE*,VALUE);
7880
7881static VALUE
7882tr_trans(VALUE str, VALUE src, VALUE repl, int sflag)
7883{
7884 const unsigned int errc = -1;
7885 unsigned int trans[256];
7886 rb_encoding *enc, *e1, *e2;
7887 struct tr trsrc, trrepl;
7888 int cflag = 0;
7889 unsigned int c, c0, last = 0;
7890 int modify = 0, i, l;
7891 unsigned char *s, *send;
7892 VALUE hash = 0;
7893 int singlebyte = single_byte_optimizable(str);
7894 int termlen;
7895 int cr;
7896
7897#define CHECK_IF_ASCII(c) \
7898 (void)((cr == ENC_CODERANGE_7BIT && !rb_isascii(c)) ? \
7899 (cr = ENC_CODERANGE_VALID) : 0)
7900
7901 StringValue(src);
7902 StringValue(repl);
7903 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
7904 if (RSTRING_LEN(repl) == 0) {
7905 return rb_str_delete_bang(1, &src, str);
7906 }
7907
7908 cr = ENC_CODERANGE(str);
7909 e1 = rb_enc_check(str, src);
7910 e2 = rb_enc_check(str, repl);
7911 if (e1 == e2) {
7912 enc = e1;
7913 }
7914 else {
7915 enc = rb_enc_check(src, repl);
7916 }
7917 trsrc.p = RSTRING_PTR(src); trsrc.pend = trsrc.p + RSTRING_LEN(src);
7918 if (RSTRING_LEN(src) > 1 &&
7919 rb_enc_ascget(trsrc.p, trsrc.pend, &l, enc) == '^' &&
7920 trsrc.p + l < trsrc.pend) {
7921 cflag = 1;
7922 trsrc.p += l;
7923 }
7924 trrepl.p = RSTRING_PTR(repl);
7925 trrepl.pend = trrepl.p + RSTRING_LEN(repl);
7926 trsrc.gen = trrepl.gen = 0;
7927 trsrc.now = trrepl.now = 0;
7928 trsrc.max = trrepl.max = 0;
7929
7930 if (cflag) {
7931 for (i=0; i<256; i++) {
7932 trans[i] = 1;
7933 }
7934 while ((c = trnext(&trsrc, enc)) != errc) {
7935 if (c < 256) {
7936 trans[c] = errc;
7937 }
7938 else {
7939 if (!hash) hash = rb_hash_new();
7940 rb_hash_aset(hash, UINT2NUM(c), Qtrue);
7941 }
7942 }
7943 while ((c = trnext(&trrepl, enc)) != errc)
7944 /* retrieve last replacer */;
7945 last = trrepl.now;
7946 for (i=0; i<256; i++) {
7947 if (trans[i] != errc) {
7948 trans[i] = last;
7949 }
7950 }
7951 }
7952 else {
7953 unsigned int r;
7954
7955 for (i=0; i<256; i++) {
7956 trans[i] = errc;
7957 }
7958 while ((c = trnext(&trsrc, enc)) != errc) {
7959 r = trnext(&trrepl, enc);
7960 if (r == errc) r = trrepl.now;
7961 if (c < 256) {
7962 trans[c] = r;
7963 if (rb_enc_codelen(r, enc) != 1) singlebyte = 0;
7964 }
7965 else {
7966 if (!hash) hash = rb_hash_new();
7967 rb_hash_aset(hash, UINT2NUM(c), UINT2NUM(r));
7968 }
7969 }
7970 }
7971
7972 if (cr == ENC_CODERANGE_VALID && rb_enc_asciicompat(e1))
7973 cr = ENC_CODERANGE_7BIT;
7974 str_modify_keep_cr(str);
7975 s = (unsigned char *)RSTRING_PTR(str); send = (unsigned char *)RSTRING_END(str);
7976 termlen = rb_enc_mbminlen(enc);
7977 if (sflag) {
7978 int clen, tlen;
7979 long offset, max = RSTRING_LEN(str);
7980 unsigned int save = -1;
7981 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
7982
7983 while (s < send) {
7984 int may_modify = 0;
7985
7986 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
7987 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
7988
7989 s += clen;
7990 if (c < 256) {
7991 c = trans[c];
7992 }
7993 else if (hash) {
7994 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
7995 if (NIL_P(tmp)) {
7996 if (cflag) c = last;
7997 else c = errc;
7998 }
7999 else if (cflag) c = errc;
8000 else c = NUM2INT(tmp);
8001 }
8002 else {
8003 c = errc;
8004 }
8005 if (c != (unsigned int)-1) {
8006 if (save == c) {
8007 CHECK_IF_ASCII(c);
8008 continue;
8009 }
8010 save = c;
8011 tlen = rb_enc_codelen(c, enc);
8012 modify = 1;
8013 }
8014 else {
8015 save = -1;
8016 c = c0;
8017 if (enc != e1) may_modify = 1;
8018 }
8019 if ((offset = t - buf) + tlen > max) {
8020 size_t MAYBE_UNUSED(old) = max + termlen;
8021 max = offset + tlen + (send - s);
8022 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8023 t = buf + offset;
8024 }
8025 rb_enc_mbcput(c, t, enc);
8026 if (may_modify && memcmp(s, t, tlen) != 0) {
8027 modify = 1;
8028 }
8029 CHECK_IF_ASCII(c);
8030 t += tlen;
8031 }
8032 if (!STR_EMBED_P(str)) {
8033 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8034 }
8035 TERM_FILL((char *)t, termlen);
8036 RSTRING(str)->as.heap.ptr = (char *)buf;
8037 STR_SET_LEN(str, t - buf);
8038 STR_SET_NOEMBED(str);
8039 RSTRING(str)->as.heap.aux.capa = max;
8040 }
8041 else if (rb_enc_mbmaxlen(enc) == 1 || (singlebyte && !hash)) {
8042 while (s < send) {
8043 c = (unsigned char)*s;
8044 if (trans[c] != errc) {
8045 if (!cflag) {
8046 c = trans[c];
8047 *s = c;
8048 modify = 1;
8049 }
8050 else {
8051 *s = last;
8052 modify = 1;
8053 }
8054 }
8055 CHECK_IF_ASCII(c);
8056 s++;
8057 }
8058 }
8059 else {
8060 int clen, tlen;
8061 long offset, max = (long)((send - s) * 1.2);
8062 unsigned char *buf = ALLOC_N(unsigned char, max + termlen), *t = buf;
8063
8064 while (s < send) {
8065 int may_modify = 0;
8066 c0 = c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, e1);
8067 tlen = enc == e1 ? clen : rb_enc_codelen(c, enc);
8068
8069 if (c < 256) {
8070 c = trans[c];
8071 }
8072 else if (hash) {
8073 VALUE tmp = rb_hash_lookup(hash, UINT2NUM(c));
8074 if (NIL_P(tmp)) {
8075 if (cflag) c = last;
8076 else c = errc;
8077 }
8078 else if (cflag) c = errc;
8079 else c = NUM2INT(tmp);
8080 }
8081 else {
8082 c = cflag ? last : errc;
8083 }
8084 if (c != errc) {
8085 tlen = rb_enc_codelen(c, enc);
8086 modify = 1;
8087 }
8088 else {
8089 c = c0;
8090 if (enc != e1) may_modify = 1;
8091 }
8092 if ((offset = t - buf) + tlen > max) {
8093 size_t MAYBE_UNUSED(old) = max + termlen;
8094 max = offset + tlen + (long)((send - s) * 1.2);
8095 SIZED_REALLOC_N(buf, unsigned char, max + termlen, old);
8096 t = buf + offset;
8097 }
8098 if (s != t) {
8099 rb_enc_mbcput(c, t, enc);
8100 if (may_modify && memcmp(s, t, tlen) != 0) {
8101 modify = 1;
8102 }
8103 }
8104 CHECK_IF_ASCII(c);
8105 s += clen;
8106 t += tlen;
8107 }
8108 if (!STR_EMBED_P(str)) {
8109 ruby_sized_xfree(STR_HEAP_PTR(str), STR_HEAP_SIZE(str));
8110 }
8111 TERM_FILL((char *)t, termlen);
8112 RSTRING(str)->as.heap.ptr = (char *)buf;
8113 STR_SET_LEN(str, t - buf);
8114 STR_SET_NOEMBED(str);
8115 RSTRING(str)->as.heap.aux.capa = max;
8116 }
8117
8118 if (modify) {
8119 if (cr != ENC_CODERANGE_BROKEN)
8120 ENC_CODERANGE_SET(str, cr);
8121 rb_enc_associate(str, enc);
8122 return str;
8123 }
8124 return Qnil;
8125}
8126
8127
8128/*
8129 * call-seq:
8130 * tr!(selector, replacements) -> self or nil
8131 *
8132 * Like String#tr, but modifies +self+ in place.
8133 * Returns +self+ if any changes were made, +nil+ otherwise.
8134 *
8135 */
8136
8137static VALUE
8138rb_str_tr_bang(VALUE str, VALUE src, VALUE repl)
8139{
8140 return tr_trans(str, src, repl, 0);
8141}
8142
8143
8144/*
8145 * call-seq:
8146 * tr(selector, replacements) -> new_string
8147 *
8148 * Returns a copy of +self+ with each character specified by string +selector+
8149 * translated to the corresponding character in string +replacements+.
8150 * The correspondence is _positional_:
8151 *
8152 * - Each occurrence of the first character specified by +selector+
8153 * is translated to the first character in +replacements+.
8154 * - Each occurrence of the second character specified by +selector+
8155 * is translated to the second character in +replacements+.
8156 * - And so on.
8157 *
8158 * Example:
8159 *
8160 * 'hello'.tr('el', 'ip') #=> "hippo"
8161 *
8162 * If +replacements+ is shorter than +selector+,
8163 * it is implicitly padded with its own last character:
8164 *
8165 * 'hello'.tr('aeiou', '-') # => "h-ll-"
8166 * 'hello'.tr('aeiou', 'AA-') # => "hAll-"
8167 *
8168 * Arguments +selector+ and +replacements+ must be valid character selectors
8169 * (see {Character Selectors}[rdoc-ref:character_selectors.rdoc]),
8170 * and may use any of its valid forms, including negation, ranges, and escaping:
8171 *
8172 * # Negation.
8173 * 'hello'.tr('^aeiou', '-') # => "-e--o"
8174 * # Ranges.
8175 * 'ibm'.tr('b-z', 'a-z') # => "hal"
8176 * # Escapes.
8177 * 'hel^lo'.tr('\^aeiou', '-') # => "h-l-l-" # Escaped leading caret.
8178 * 'i-b-m'.tr('b\-z', 'a-z') # => "ibabm" # Escaped embedded hyphen.
8179 * 'foo\\bar'.tr('ab\\', 'XYZ') # => "fooZYXr" # Escaped backslash.
8180 *
8181 */
8182
8183static VALUE
8184rb_str_tr(VALUE str, VALUE src, VALUE repl)
8185{
8186 str = str_duplicate(rb_cString, str);
8187 tr_trans(str, src, repl, 0);
8188 return str;
8189}
8190
8191#define TR_TABLE_MAX (UCHAR_MAX+1)
8192#define TR_TABLE_SIZE (TR_TABLE_MAX+1)
8193static void
8194tr_setup_table(VALUE str, char stable[TR_TABLE_SIZE], int first,
8195 VALUE *tablep, VALUE *ctablep, rb_encoding *enc)
8196{
8197 const unsigned int errc = -1;
8198 char buf[TR_TABLE_MAX];
8199 struct tr tr;
8200 unsigned int c;
8201 VALUE table = 0, ptable = 0;
8202 int i, l, cflag = 0;
8203
8204 tr.p = RSTRING_PTR(str); tr.pend = tr.p + RSTRING_LEN(str);
8205 tr.gen = tr.now = tr.max = 0;
8206
8207 if (RSTRING_LEN(str) > 1 && rb_enc_ascget(tr.p, tr.pend, &l, enc) == '^') {
8208 cflag = 1;
8209 tr.p += l;
8210 }
8211 if (first) {
8212 for (i=0; i<TR_TABLE_MAX; i++) {
8213 stable[i] = 1;
8214 }
8215 stable[TR_TABLE_MAX] = cflag;
8216 }
8217 else if (stable[TR_TABLE_MAX] && !cflag) {
8218 stable[TR_TABLE_MAX] = 0;
8219 }
8220 for (i=0; i<TR_TABLE_MAX; i++) {
8221 buf[i] = cflag;
8222 }
8223
8224 while ((c = trnext(&tr, enc)) != errc) {
8225 if (c < TR_TABLE_MAX) {
8226 buf[(unsigned char)c] = !cflag;
8227 }
8228 else {
8229 VALUE key = UINT2NUM(c);
8230
8231 if (!table && (first || *tablep || stable[TR_TABLE_MAX])) {
8232 if (cflag) {
8233 ptable = *ctablep;
8234 table = ptable ? ptable : rb_hash_new();
8235 *ctablep = table;
8236 }
8237 else {
8238 table = rb_hash_new();
8239 ptable = *tablep;
8240 *tablep = table;
8241 }
8242 }
8243 if (table && (!ptable || (cflag ^ !NIL_P(rb_hash_aref(ptable, key))))) {
8244 rb_hash_aset(table, key, Qtrue);
8245 }
8246 }
8247 }
8248 for (i=0; i<TR_TABLE_MAX; i++) {
8249 stable[i] = stable[i] && buf[i];
8250 }
8251 if (!table && !cflag) {
8252 *tablep = 0;
8253 }
8254}
8255
8256
8257static int
8258tr_find(unsigned int c, const char table[TR_TABLE_SIZE], VALUE del, VALUE nodel)
8259{
8260 if (c < TR_TABLE_MAX) {
8261 return table[c] != 0;
8262 }
8263 else {
8264 VALUE v = UINT2NUM(c);
8265
8266 if (del) {
8267 if (!NIL_P(rb_hash_lookup(del, v)) &&
8268 (!nodel || NIL_P(rb_hash_lookup(nodel, v)))) {
8269 return TRUE;
8270 }
8271 }
8272 else if (nodel && !NIL_P(rb_hash_lookup(nodel, v))) {
8273 return FALSE;
8274 }
8275 return table[TR_TABLE_MAX] ? TRUE : FALSE;
8276 }
8277}
8278
8279/*
8280 * call-seq:
8281 * delete!(*selectors) -> self or nil
8282 *
8283 * Like String#delete, but modifies +self+ in place.
8284 * Returns +self+ if any changes were made, +nil+ otherwise.
8285 *
8286 */
8287
8288static VALUE
8289rb_str_delete_bang(int argc, VALUE *argv, VALUE str)
8290{
8291 char squeez[TR_TABLE_SIZE];
8292 rb_encoding *enc = 0;
8293 char *s, *send, *t;
8294 VALUE del = 0, nodel = 0;
8295 int modify = 0;
8296 int i, ascompat, cr;
8297
8298 if (RSTRING_LEN(str) == 0 || !RSTRING_PTR(str)) return Qnil;
8300 for (i=0; i<argc; i++) {
8301 VALUE s = argv[i];
8302
8303 StringValue(s);
8304 enc = rb_enc_check(str, s);
8305 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8306 }
8307
8308 str_modify_keep_cr(str);
8309 ascompat = rb_enc_asciicompat(enc);
8310 s = t = RSTRING_PTR(str);
8311 send = RSTRING_END(str);
8312 cr = ascompat ? ENC_CODERANGE_7BIT : ENC_CODERANGE_VALID;
8313 while (s < send) {
8314 unsigned int c;
8315 int clen;
8316
8317 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8318 if (squeez[c]) {
8319 modify = 1;
8320 }
8321 else {
8322 if (t != s) *t = c;
8323 t++;
8324 }
8325 s++;
8326 }
8327 else {
8328 c = rb_enc_codepoint_len(s, send, &clen, enc);
8329
8330 if (tr_find(c, squeez, del, nodel)) {
8331 modify = 1;
8332 }
8333 else {
8334 if (t != s) rb_enc_mbcput(c, t, enc);
8335 t += clen;
8337 }
8338 s += clen;
8339 }
8340 }
8341 TERM_FILL(t, TERM_LEN(str));
8342 STR_SET_LEN(str, t - RSTRING_PTR(str));
8343 ENC_CODERANGE_SET(str, cr);
8344
8345 if (modify) return str;
8346 return Qnil;
8347}
8348
8349
8350/*
8351 * call-seq:
8352 * delete(*selectors) -> new_string
8353 *
8354 * Returns a copy of +self+ with characters specified by +selectors+ removed
8355 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8356 *
8357 * "hello".delete "l","lo" #=> "heo"
8358 * "hello".delete "lo" #=> "he"
8359 * "hello".delete "aeiou", "^e" #=> "hell"
8360 * "hello".delete "ej-m" #=> "ho"
8361 *
8362 */
8363
8364static VALUE
8365rb_str_delete(int argc, VALUE *argv, VALUE str)
8366{
8367 str = str_duplicate(rb_cString, str);
8368 rb_str_delete_bang(argc, argv, str);
8369 return str;
8370}
8371
8372
8373/*
8374 * call-seq:
8375 * squeeze!(*selectors) -> self or nil
8376 *
8377 * Like String#squeeze, but modifies +self+ in place.
8378 * Returns +self+ if any changes were made, +nil+ otherwise.
8379 */
8380
8381static VALUE
8382rb_str_squeeze_bang(int argc, VALUE *argv, VALUE str)
8383{
8384 char squeez[TR_TABLE_SIZE];
8385 rb_encoding *enc = 0;
8386 VALUE del = 0, nodel = 0;
8387 unsigned char *s, *send, *t;
8388 int i, modify = 0;
8389 int ascompat, singlebyte = single_byte_optimizable(str);
8390 unsigned int save;
8391
8392 if (argc == 0) {
8393 enc = STR_ENC_GET(str);
8394 }
8395 else {
8396 for (i=0; i<argc; i++) {
8397 VALUE s = argv[i];
8398
8399 StringValue(s);
8400 enc = rb_enc_check(str, s);
8401 if (singlebyte && !single_byte_optimizable(s))
8402 singlebyte = 0;
8403 tr_setup_table(s, squeez, i==0, &del, &nodel, enc);
8404 }
8405 }
8406
8407 str_modify_keep_cr(str);
8408 s = t = (unsigned char *)RSTRING_PTR(str);
8409 if (!s || RSTRING_LEN(str) == 0) return Qnil;
8410 send = (unsigned char *)RSTRING_END(str);
8411 save = -1;
8412 ascompat = rb_enc_asciicompat(enc);
8413
8414 if (singlebyte) {
8415 while (s < send) {
8416 unsigned int c = *s++;
8417 if (c != save || (argc > 0 && !squeez[c])) {
8418 *t++ = save = c;
8419 }
8420 }
8421 }
8422 else {
8423 while (s < send) {
8424 unsigned int c;
8425 int clen;
8426
8427 if (ascompat && (c = *s) < 0x80) {
8428 if (c != save || (argc > 0 && !squeez[c])) {
8429 *t++ = save = c;
8430 }
8431 s++;
8432 }
8433 else {
8434 c = rb_enc_codepoint_len((char *)s, (char *)send, &clen, enc);
8435
8436 if (c != save || (argc > 0 && !tr_find(c, squeez, del, nodel))) {
8437 if (t != s) rb_enc_mbcput(c, t, enc);
8438 save = c;
8439 t += clen;
8440 }
8441 s += clen;
8442 }
8443 }
8444 }
8445
8446 TERM_FILL((char *)t, TERM_LEN(str));
8447 if ((char *)t - RSTRING_PTR(str) != RSTRING_LEN(str)) {
8448 STR_SET_LEN(str, (char *)t - RSTRING_PTR(str));
8449 modify = 1;
8450 }
8451
8452 if (modify) return str;
8453 return Qnil;
8454}
8455
8456
8457/*
8458 * call-seq:
8459 * squeeze(*selectors) -> new_string
8460 *
8461 * Returns a copy of +self+ with characters specified by +selectors+ "squeezed"
8462 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8463 *
8464 * "Squeezed" means that each multiple-character run of a selected character
8465 * is squeezed down to a single character;
8466 * with no arguments given, squeezes all characters:
8467 *
8468 * "yellow moon".squeeze #=> "yelow mon"
 *    "  now   is  the".squeeze(" ")         #=> " now is the"
8470 * "putters shoot balls".squeeze("m-z") #=> "puters shot balls"
8471 *
8472 */
8473
8474static VALUE
8475rb_str_squeeze(int argc, VALUE *argv, VALUE str)
8476{
8477 str = str_duplicate(rb_cString, str);
8478 rb_str_squeeze_bang(argc, argv, str);
8479 return str;
8480}
8481
8482
8483/*
8484 * call-seq:
8485 * tr_s!(selector, replacements) -> self or nil
8486 *
8487 * Like String#tr_s, but modifies +self+ in place.
8488 * Returns +self+ if any changes were made, +nil+ otherwise.
8489 *
8490 * Related: String#squeeze!.
8491 */
8492
8493static VALUE
8494rb_str_tr_s_bang(VALUE str, VALUE src, VALUE repl)
8495{
8496 return tr_trans(str, src, repl, 1);
8497}
8498
8499
8500/*
8501 * call-seq:
8502 * tr_s(selector, replacements) -> string
8503 *
8504 * Like String#tr, but also squeezes the modified portions of the translated string;
8505 * returns a new string (translated and squeezed).
8506 *
8507 * 'hello'.tr_s('l', 'r') #=> "hero"
8508 * 'hello'.tr_s('el', '-') #=> "h-o"
8509 * 'hello'.tr_s('el', 'hx') #=> "hhxo"
8510 *
8511 * Related: String#squeeze.
8512 *
8513 */
8514
8515static VALUE
8516rb_str_tr_s(VALUE str, VALUE src, VALUE repl)
8517{
8518 str = str_duplicate(rb_cString, str);
8519 tr_trans(str, src, repl, 1);
8520 return str;
8521}
8522
8523
8524/*
8525 * call-seq:
8526 * count(*selectors) -> integer
8527 *
8528 * Returns the total number of characters in +self+
8529 * that are specified by the given +selectors+
8530 * (see {Multiple Character Selectors}[rdoc-ref:character_selectors.rdoc@Multiple+Character+Selectors]):
8531 *
8532 * a = "hello world"
8533 * a.count "lo" #=> 5
8534 * a.count "lo", "o" #=> 2
8535 * a.count "hello", "^l" #=> 4
8536 * a.count "ej-m" #=> 4
8537 *
8538 * "hello^world".count "\\^aeiou" #=> 4
8539 * "hello-world".count "a\\-eo" #=> 4
8540 *
8541 * c = "hello world\\r\\n"
8542 * c.count "\\" #=> 2
8543 * c.count "\\A" #=> 0
8544 * c.count "X-\\w" #=> 3
8545 */
8546
8547static VALUE
8548rb_str_count(int argc, VALUE *argv, VALUE str)
8549{
8550 char table[TR_TABLE_SIZE];
8551 rb_encoding *enc = 0;
8552 VALUE del = 0, nodel = 0, tstr;
8553 char *s, *send;
8554 int i;
8555 int ascompat;
8556 size_t n = 0;
8557
8559
8560 tstr = argv[0];
8561 StringValue(tstr);
8562 enc = rb_enc_check(str, tstr);
8563 if (argc == 1) {
8564 const char *ptstr;
8565 if (RSTRING_LEN(tstr) == 1 && rb_enc_asciicompat(enc) &&
8566 (ptstr = RSTRING_PTR(tstr),
8567 ONIGENC_IS_ALLOWED_REVERSE_MATCH(enc, (const unsigned char *)ptstr, (const unsigned char *)ptstr+1)) &&
8568 !is_broken_string(str)) {
8569 int clen;
8570 unsigned char c = rb_enc_codepoint_len(ptstr, ptstr+1, &clen, enc);
8571
8572 s = RSTRING_PTR(str);
8573 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8574 send = RSTRING_END(str);
8575 while (s < send) {
8576 if (*(unsigned char*)s++ == c) n++;
8577 }
8578 return SIZET2NUM(n);
8579 }
8580 }
8581
8582 tr_setup_table(tstr, table, TRUE, &del, &nodel, enc);
8583 for (i=1; i<argc; i++) {
8584 tstr = argv[i];
8585 StringValue(tstr);
8586 enc = rb_enc_check(str, tstr);
8587 tr_setup_table(tstr, table, FALSE, &del, &nodel, enc);
8588 }
8589
8590 s = RSTRING_PTR(str);
8591 if (!s || RSTRING_LEN(str) == 0) return INT2FIX(0);
8592 send = RSTRING_END(str);
8593 ascompat = rb_enc_asciicompat(enc);
8594 while (s < send) {
8595 unsigned int c;
8596
8597 if (ascompat && (c = *(unsigned char*)s) < 0x80) {
8598 if (table[c]) {
8599 n++;
8600 }
8601 s++;
8602 }
8603 else {
8604 int clen;
8605 c = rb_enc_codepoint_len(s, send, &clen, enc);
8606 if (tr_find(c, table, del, nodel)) {
8607 n++;
8608 }
8609 s += clen;
8610 }
8611 }
8612
8613 return SIZET2NUM(n);
8614}
8615
8616static VALUE
8617rb_fs_check(VALUE val)
8618{
8619 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING) && !RB_TYPE_P(val, T_REGEXP)) {
8620 val = rb_check_string_type(val);
8621 if (NIL_P(val)) return 0;
8622 }
8623 return val;
8624}
8625
/*
 * Byte-indexed lookup table: 1 for the six byte values listed below,
 * 0 for every other byte (entries not named are zero-initialized).
 */
static const char isspacetable[256] = {
    [0x09] = 1,     /* horizontal tab */
    [0x0a] = 1,     /* line feed */
    [0x0b] = 1,     /* vertical tab */
    [0x0c] = 1,     /* form feed */
    [0x0d] = 1,     /* carriage return */
    [0x20] = 1,     /* space */
};

#define ascii_isspace(c) isspacetable[(unsigned char)(c)]
8646
8647static long
8648split_string(VALUE result, VALUE str, long beg, long len, long empty_count)
8649{
8650 if (empty_count >= 0 && len == 0) {
8651 return empty_count + 1;
8652 }
8653 if (empty_count > 0) {
8654 /* make different substrings */
8655 if (result) {
8656 do {
8657 rb_ary_push(result, str_new_empty_String(str));
8658 } while (--empty_count > 0);
8659 }
8660 else {
8661 do {
8662 rb_yield(str_new_empty_String(str));
8663 } while (--empty_count > 0);
8664 }
8665 }
8666 str = rb_str_subseq(str, beg, len);
8667 if (result) {
8668 rb_ary_push(result, str);
8669 }
8670 else {
8671 rb_yield(str);
8672 }
8673 return empty_count;
8674}
8675
/* Strategy used by rb_str_split_m to cut the receiver into fields. */
typedef enum {
    SPLIT_TYPE_AWK,     /* split on runs of whitespace */
    SPLIT_TYPE_STRING,  /* split on a literal string separator */
    SPLIT_TYPE_REGEXP,  /* split on regexp matches */
    SPLIT_TYPE_CHARS    /* empty separator: split into characters */
} split_type_t;
8679
8680static split_type_t
8681literal_split_pattern(VALUE spat, split_type_t default_type)
8682{
8683 rb_encoding *enc = STR_ENC_GET(spat);
8684 const char *ptr;
8685 long len;
8686 RSTRING_GETMEM(spat, ptr, len);
8687 if (len == 0) {
8688 /* Special case - split into chars */
8689 return SPLIT_TYPE_CHARS;
8690 }
8691 else if (rb_enc_asciicompat(enc)) {
8692 if (len == 1 && ptr[0] == ' ') {
8693 return SPLIT_TYPE_AWK;
8694 }
8695 }
8696 else {
8697 int l;
8698 if (rb_enc_ascget(ptr, ptr + len, &l, enc) == ' ' && len == l) {
8699 return SPLIT_TYPE_AWK;
8700 }
8701 }
8702 return default_type;
8703}
8704
8705/*
8706 * call-seq:
8707 * split(field_sep = $;, limit = nil) -> array
8708 * split(field_sep = $;, limit = nil) {|substring| ... } -> self
8709 *
8710 * :include: doc/string/split.rdoc
8711 *
8712 */
8713
8714static VALUE
8715rb_str_split_m(int argc, VALUE *argv, VALUE str)
8716{
8717 rb_encoding *enc;
8718 VALUE spat;
8719 VALUE limit;
8720 split_type_t split_type;
8721 long beg, end, i = 0, empty_count = -1;
8722 int lim = 0;
8723 VALUE result, tmp;
8724
8725 result = rb_block_given_p() ? Qfalse : Qnil;
8726 if (rb_scan_args(argc, argv, "02", &spat, &limit) == 2) {
8727 lim = NUM2INT(limit);
8728 if (lim <= 0) limit = Qnil;
8729 else if (lim == 1) {
8730 if (RSTRING_LEN(str) == 0)
8731 return result ? rb_ary_new2(0) : str;
8732 tmp = str_duplicate(rb_cString, str);
8733 if (!result) {
8734 rb_yield(tmp);
8735 return str;
8736 }
8737 return rb_ary_new3(1, tmp);
8738 }
8739 i = 1;
8740 }
8741 if (NIL_P(limit) && !lim) empty_count = 0;
8742
8743 enc = STR_ENC_GET(str);
8744 split_type = SPLIT_TYPE_REGEXP;
8745 if (!NIL_P(spat)) {
8746 spat = get_pat_quoted(spat, 0);
8747 }
8748 else if (NIL_P(spat = rb_fs)) {
8749 split_type = SPLIT_TYPE_AWK;
8750 }
8751 else if (!(spat = rb_fs_check(spat))) {
8752 rb_raise(rb_eTypeError, "value of $; must be String or Regexp");
8753 }
8754 else {
8755 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$; is set to non-nil value");
8756 }
8757 if (split_type != SPLIT_TYPE_AWK) {
8758 switch (BUILTIN_TYPE(spat)) {
8759 case T_REGEXP:
8760 rb_reg_options(spat); /* check if uninitialized */
8761 tmp = RREGEXP_SRC(spat);
8762 split_type = literal_split_pattern(tmp, SPLIT_TYPE_REGEXP);
8763 if (split_type == SPLIT_TYPE_AWK) {
8764 spat = tmp;
8765 split_type = SPLIT_TYPE_STRING;
8766 }
8767 break;
8768
8769 case T_STRING:
8770 mustnot_broken(spat);
8771 split_type = literal_split_pattern(spat, SPLIT_TYPE_STRING);
8772 break;
8773
8774 default:
8776 }
8777 }
8778
8779#define SPLIT_STR(beg, len) (empty_count = split_string(result, str, beg, len, empty_count))
8780
8781 beg = 0;
8782 char *ptr = RSTRING_PTR(str);
8783 char *eptr = RSTRING_END(str);
8784 if (split_type == SPLIT_TYPE_AWK) {
8785 char *bptr = ptr;
8786 int skip = 1;
8787 unsigned int c;
8788
8789 if (result) result = rb_ary_new();
8790 end = beg;
8791 if (is_ascii_string(str)) {
8792 while (ptr < eptr) {
8793 c = (unsigned char)*ptr++;
8794 if (skip) {
8795 if (ascii_isspace(c)) {
8796 beg = ptr - bptr;
8797 }
8798 else {
8799 end = ptr - bptr;
8800 skip = 0;
8801 if (!NIL_P(limit) && lim <= i) break;
8802 }
8803 }
8804 else if (ascii_isspace(c)) {
8805 SPLIT_STR(beg, end-beg);
8806 skip = 1;
8807 beg = ptr - bptr;
8808 if (!NIL_P(limit)) ++i;
8809 }
8810 else {
8811 end = ptr - bptr;
8812 }
8813 }
8814 }
8815 else {
8816 while (ptr < eptr) {
8817 int n;
8818
8819 c = rb_enc_codepoint_len(ptr, eptr, &n, enc);
8820 ptr += n;
8821 if (skip) {
8822 if (rb_isspace(c)) {
8823 beg = ptr - bptr;
8824 }
8825 else {
8826 end = ptr - bptr;
8827 skip = 0;
8828 if (!NIL_P(limit) && lim <= i) break;
8829 }
8830 }
8831 else if (rb_isspace(c)) {
8832 SPLIT_STR(beg, end-beg);
8833 skip = 1;
8834 beg = ptr - bptr;
8835 if (!NIL_P(limit)) ++i;
8836 }
8837 else {
8838 end = ptr - bptr;
8839 }
8840 }
8841 }
8842 }
8843 else if (split_type == SPLIT_TYPE_STRING) {
8844 char *str_start = ptr;
8845 char *substr_start = ptr;
8846 char *sptr = RSTRING_PTR(spat);
8847 long slen = RSTRING_LEN(spat);
8848
8849 if (result) result = rb_ary_new();
8850 mustnot_broken(str);
8851 enc = rb_enc_check(str, spat);
8852 while (ptr < eptr &&
8853 (end = rb_memsearch(sptr, slen, ptr, eptr - ptr, enc)) >= 0) {
8854 /* Check we are at the start of a char */
8855 char *t = rb_enc_right_char_head(ptr, ptr + end, eptr, enc);
8856 if (t != ptr + end) {
8857 ptr = t;
8858 continue;
8859 }
8860 SPLIT_STR(substr_start - str_start, (ptr+end) - substr_start);
8861 ptr += end + slen;
8862 substr_start = ptr;
8863 if (!NIL_P(limit) && lim <= ++i) break;
8864 }
8865 beg = ptr - str_start;
8866 }
8867 else if (split_type == SPLIT_TYPE_CHARS) {
8868 char *str_start = ptr;
8869 int n;
8870
8871 if (result) result = rb_ary_new_capa(RSTRING_LEN(str));
8872 mustnot_broken(str);
8873 enc = rb_enc_get(str);
8874 while (ptr < eptr &&
8875 (n = rb_enc_precise_mbclen(ptr, eptr, enc)) > 0) {
8876 SPLIT_STR(ptr - str_start, n);
8877 ptr += n;
8878 if (!NIL_P(limit) && lim <= ++i) break;
8879 }
8880 beg = ptr - str_start;
8881 }
8882 else {
8883 if (result) result = rb_ary_new();
8884 long len = RSTRING_LEN(str);
8885 long start = beg;
8886 long idx;
8887 int last_null = 0;
8888 struct re_registers *regs;
8889 VALUE match = 0;
8890
8891 for (; rb_reg_search(spat, str, start, 0) >= 0;
8892 (match ? (rb_match_unbusy(match), rb_backref_set(match)) : (void)0)) {
8893 match = rb_backref_get();
8894 if (!result) rb_match_busy(match);
8895 regs = RMATCH_REGS(match);
8896 end = BEG(0);
8897 if (start == end && BEG(0) == END(0)) {
8898 if (!ptr) {
8899 SPLIT_STR(0, 0);
8900 break;
8901 }
8902 else if (last_null == 1) {
8903 SPLIT_STR(beg, rb_enc_fast_mbclen(ptr+beg, eptr, enc));
8904 beg = start;
8905 }
8906 else {
8907 if (start == len)
8908 start++;
8909 else
8910 start += rb_enc_fast_mbclen(ptr+start,eptr,enc);
8911 last_null = 1;
8912 continue;
8913 }
8914 }
8915 else {
8916 SPLIT_STR(beg, end-beg);
8917 beg = start = END(0);
8918 }
8919 last_null = 0;
8920
8921 for (idx=1; idx < regs->num_regs; idx++) {
8922 if (BEG(idx) == -1) continue;
8923 SPLIT_STR(BEG(idx), END(idx)-BEG(idx));
8924 }
8925 if (!NIL_P(limit) && lim <= ++i) break;
8926 }
8927 if (match) rb_match_unbusy(match);
8928 }
8929 if (RSTRING_LEN(str) > 0 && (!NIL_P(limit) || RSTRING_LEN(str) > beg || lim < 0)) {
8930 SPLIT_STR(beg, RSTRING_LEN(str)-beg);
8931 }
8932
8933 return result ? result : str;
8934}
8935
8936VALUE
8937rb_str_split(VALUE str, const char *sep0)
8938{
8939 VALUE sep;
8940
8941 StringValue(str);
8942 sep = rb_str_new_cstr(sep0);
8943 return rb_str_split_m(1, &sep, str);
8944}
8945
8946#define WANTARRAY(m, size) (!rb_block_given_p() ? rb_ary_new_capa(size) : 0)
8947
8948static inline int
8949enumerator_element(VALUE ary, VALUE e)
8950{
8951 if (ary) {
8952 rb_ary_push(ary, e);
8953 return 0;
8954 }
8955 else {
8956 rb_yield(e);
8957 return 1;
8958 }
8959}
8960
8961#define ENUM_ELEM(ary, e) enumerator_element(ary, e)
8962
8963static const char *
8964chomp_newline(const char *p, const char *e, rb_encoding *enc)
8965{
8966 const char *prev = rb_enc_prev_char(p, e, e, enc);
8967 if (rb_enc_is_newline(prev, e, enc)) {
8968 e = prev;
8969 prev = rb_enc_prev_char(p, e, e, enc);
8970 if (prev && rb_enc_ascget(prev, e, NULL, enc) == '\r')
8971 e = prev;
8972 }
8973 return e;
8974}
8975
8976static VALUE
8977get_rs(void)
8978{
8979 VALUE rs = rb_rs;
8980 if (!NIL_P(rs) &&
8981 (!RB_TYPE_P(rs, T_STRING) ||
8982 RSTRING_LEN(rs) != 1 ||
8983 RSTRING_PTR(rs)[0] != '\n')) {
8984 rb_category_warn(RB_WARN_CATEGORY_DEPRECATED, "$/ is set to non-default value");
8985 }
8986 return rs;
8987}
8988
8989#define rb_rs get_rs()
8990
8991static VALUE
8992rb_str_enumerate_lines(int argc, VALUE *argv, VALUE str, VALUE ary)
8993{
8994 rb_encoding *enc;
8995 VALUE line, rs, orig = str, opts = Qnil, chomp = Qfalse;
8996 const char *ptr, *pend, *subptr, *subend, *rsptr, *hit, *adjusted;
8997 long pos, len, rslen;
8998 int rsnewline = 0;
8999
9000 if (rb_scan_args(argc, argv, "01:", &rs, &opts) == 0)
9001 rs = rb_rs;
9002 if (!NIL_P(opts)) {
9003 static ID keywords[1];
9004 if (!keywords[0]) {
9005 keywords[0] = rb_intern_const("chomp");
9006 }
9007 rb_get_kwargs(opts, keywords, 0, 1, &chomp);
9008 chomp = (!UNDEF_P(chomp) && RTEST(chomp));
9009 }
9010
9011 if (NIL_P(rs)) {
9012 if (!ENUM_ELEM(ary, str)) {
9013 return ary;
9014 }
9015 else {
9016 return orig;
9017 }
9018 }
9019
9020 if (!RSTRING_LEN(str)) goto end;
9021 str = rb_str_new_frozen(str);
9022 ptr = subptr = RSTRING_PTR(str);
9023 pend = RSTRING_END(str);
9024 len = RSTRING_LEN(str);
9025 StringValue(rs);
9026 rslen = RSTRING_LEN(rs);
9027
9028 if (rs == rb_default_rs)
9029 enc = rb_enc_get(str);
9030 else
9031 enc = rb_enc_check(str, rs);
9032
9033 if (rslen == 0) {
9034 /* paragraph mode */
9035 int n;
9036 const char *eol = NULL;
9037 subend = subptr;
9038 while (subend < pend) {
9039 long chomp_rslen = 0;
9040 do {
9041 if (rb_enc_ascget(subend, pend, &n, enc) != '\r')
9042 n = 0;
9043 rslen = n + rb_enc_mbclen(subend + n, pend, enc);
9044 if (rb_enc_is_newline(subend + n, pend, enc)) {
9045 if (eol == subend) break;
9046 subend += rslen;
9047 if (subptr) {
9048 eol = subend;
9049 chomp_rslen = -rslen;
9050 }
9051 }
9052 else {
9053 if (!subptr) subptr = subend;
9054 subend += rslen;
9055 }
9056 rslen = 0;
9057 } while (subend < pend);
9058 if (!subptr) break;
9059 if (rslen == 0) chomp_rslen = 0;
9060 line = rb_str_subseq(str, subptr - ptr,
9061 subend - subptr + (chomp ? chomp_rslen : rslen));
9062 if (ENUM_ELEM(ary, line)) {
9063 str_mod_check(str, ptr, len);
9064 }
9065 subptr = eol = NULL;
9066 }
9067 goto end;
9068 }
9069 else {
9070 rsptr = RSTRING_PTR(rs);
9071 if (RSTRING_LEN(rs) == rb_enc_mbminlen(enc) &&
9072 rb_enc_is_newline(rsptr, rsptr + RSTRING_LEN(rs), enc)) {
9073 rsnewline = 1;
9074 }
9075 }
9076
9077 if ((rs == rb_default_rs) && !rb_enc_asciicompat(enc)) {
9078 rs = rb_str_new(rsptr, rslen);
9079 rs = rb_str_encode(rs, rb_enc_from_encoding(enc), 0, Qnil);
9080 rsptr = RSTRING_PTR(rs);
9081 rslen = RSTRING_LEN(rs);
9082 }
9083
9084 while (subptr < pend) {
9085 pos = rb_memsearch(rsptr, rslen, subptr, pend - subptr, enc);
9086 if (pos < 0) break;
9087 hit = subptr + pos;
9088 adjusted = rb_enc_right_char_head(subptr, hit, pend, enc);
9089 if (hit != adjusted) {
9090 subptr = adjusted;
9091 continue;
9092 }
9093 subend = hit += rslen;
9094 if (chomp) {
9095 if (rsnewline) {
9096 subend = chomp_newline(subptr, subend, enc);
9097 }
9098 else {
9099 subend -= rslen;
9100 }
9101 }
9102 line = rb_str_subseq(str, subptr - ptr, subend - subptr);
9103 if (ENUM_ELEM(ary, line)) {
9104 str_mod_check(str, ptr, len);
9105 }
9106 subptr = hit;
9107 }
9108
9109 if (subptr != pend) {
9110 if (chomp) {
9111 if (rsnewline) {
9112 pend = chomp_newline(subptr, pend, enc);
9113 }
9114 else if (pend - subptr >= rslen &&
9115 memcmp(pend - rslen, rsptr, rslen) == 0) {
9116 pend -= rslen;
9117 }
9118 }
9119 line = rb_str_subseq(str, subptr - ptr, pend - subptr);
9120 ENUM_ELEM(ary, line);
9121 RB_GC_GUARD(str);
9122 }
9123
9124 end:
9125 if (ary)
9126 return ary;
9127 else
9128 return orig;
9129}
9130
9131/*
9132 * call-seq:
9133 * each_line(line_sep = $/, chomp: false) {|substring| ... } -> self
9134 * each_line(line_sep = $/, chomp: false) -> enumerator
9135 *
9136 * :include: doc/string/each_line.rdoc
9137 *
9138 */
9139
9140static VALUE
9141rb_str_each_line(int argc, VALUE *argv, VALUE str)
9142{
9143 RETURN_SIZED_ENUMERATOR(str, argc, argv, 0);
9144 return rb_str_enumerate_lines(argc, argv, str, 0);
9145}
9146
/*
 *  call-seq:
 *    lines(line_sep = $/, chomp: false) -> array_of_strings
 *
 *  Forms substrings ("lines") of +self+ according to the given arguments
 *  (see String#each_line for details); returns the lines in an array.
 *
 */
9155
9156static VALUE
9157rb_str_lines(int argc, VALUE *argv, VALUE str)
9158{
9159 VALUE ary = WANTARRAY("lines", 0);
9160 return rb_str_enumerate_lines(argc, argv, str, ary);
9161}
9162
9163static VALUE
9164rb_str_each_byte_size(VALUE str, VALUE args, VALUE eobj)
9165{
9166 return LONG2FIX(RSTRING_LEN(str));
9167}
9168
9169static VALUE
9170rb_str_enumerate_bytes(VALUE str, VALUE ary)
9171{
9172 long i;
9173
9174 for (i=0; i<RSTRING_LEN(str); i++) {
9175 ENUM_ELEM(ary, INT2FIX((unsigned char)RSTRING_PTR(str)[i]));
9176 }
9177 if (ary)
9178 return ary;
9179 else
9180 return str;
9181}
9182
9183/*
9184 * call-seq:
9185 * each_byte {|byte| ... } -> self
9186 * each_byte -> enumerator
9187 *
9188 * :include: doc/string/each_byte.rdoc
9189 *
9190 */
9191
9192static VALUE
9193rb_str_each_byte(VALUE str)
9194{
9195 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_byte_size);
9196 return rb_str_enumerate_bytes(str, 0);
9197}
9198
9199/*
9200 * call-seq:
9201 * bytes -> array_of_bytes
9202 *
9203 * :include: doc/string/bytes.rdoc
9204 *
9205 */
9206
9207static VALUE
9208rb_str_bytes(VALUE str)
9209{
9210 VALUE ary = WANTARRAY("bytes", RSTRING_LEN(str));
9211 return rb_str_enumerate_bytes(str, ary);
9212}
9213
9214static VALUE
9215rb_str_each_char_size(VALUE str, VALUE args, VALUE eobj)
9216{
9217 return rb_str_length(str);
9218}
9219
9220static VALUE
9221rb_str_enumerate_chars(VALUE str, VALUE ary)
9222{
9223 VALUE orig = str;
9224 long i, len, n;
9225 const char *ptr;
9226 rb_encoding *enc;
9227
9228 str = rb_str_new_frozen(str);
9229 ptr = RSTRING_PTR(str);
9230 len = RSTRING_LEN(str);
9231 enc = rb_enc_get(str);
9232
9234 for (i = 0; i < len; i += n) {
9235 n = rb_enc_fast_mbclen(ptr + i, ptr + len, enc);
9236 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9237 }
9238 }
9239 else {
9240 for (i = 0; i < len; i += n) {
9241 n = rb_enc_mbclen(ptr + i, ptr + len, enc);
9242 ENUM_ELEM(ary, rb_str_subseq(str, i, n));
9243 }
9244 }
9245 RB_GC_GUARD(str);
9246 if (ary)
9247 return ary;
9248 else
9249 return orig;
9250}
9251
9252/*
9253 * call-seq:
9254 * each_char {|c| ... } -> self
9255 * each_char -> enumerator
9256 *
9257 * :include: doc/string/each_char.rdoc
9258 *
9259 */
9260
9261static VALUE
9262rb_str_each_char(VALUE str)
9263{
9264 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9265 return rb_str_enumerate_chars(str, 0);
9266}
9267
9268/*
9269 * call-seq:
9270 * chars -> array_of_characters
9271 *
9272 * :include: doc/string/chars.rdoc
9273 *
9274 */
9275
9276static VALUE
9277rb_str_chars(VALUE str)
9278{
9279 VALUE ary = WANTARRAY("chars", rb_str_strlen(str));
9280 return rb_str_enumerate_chars(str, ary);
9281}
9282
9283static VALUE
9284rb_str_enumerate_codepoints(VALUE str, VALUE ary)
9285{
9286 VALUE orig = str;
9287 int n;
9288 unsigned int c;
9289 const char *ptr, *end;
9290 rb_encoding *enc;
9291
9292 if (single_byte_optimizable(str))
9293 return rb_str_enumerate_bytes(str, ary);
9294
9295 str = rb_str_new_frozen(str);
9296 ptr = RSTRING_PTR(str);
9297 end = RSTRING_END(str);
9298 enc = STR_ENC_GET(str);
9299
9300 while (ptr < end) {
9301 c = rb_enc_codepoint_len(ptr, end, &n, enc);
9302 ENUM_ELEM(ary, UINT2NUM(c));
9303 ptr += n;
9304 }
9305 RB_GC_GUARD(str);
9306 if (ary)
9307 return ary;
9308 else
9309 return orig;
9310}
9311
9312/*
9313 * call-seq:
9314 * each_codepoint {|integer| ... } -> self
9315 * each_codepoint -> enumerator
9316 *
9317 * :include: doc/string/each_codepoint.rdoc
9318 *
9319 */
9320
9321static VALUE
9322rb_str_each_codepoint(VALUE str)
9323{
9324 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_char_size);
9325 return rb_str_enumerate_codepoints(str, 0);
9326}
9327
9328/*
9329 * call-seq:
9330 * codepoints -> array_of_integers
9331 *
9332 * :include: doc/string/codepoints.rdoc
9333 *
9334 */
9335
9336static VALUE
9337rb_str_codepoints(VALUE str)
9338{
9339 VALUE ary = WANTARRAY("codepoints", rb_str_strlen(str));
9340 return rb_str_enumerate_codepoints(str, ary);
9341}
9342
9343static regex_t *
9344get_reg_grapheme_cluster(rb_encoding *enc)
9345{
9346 int encidx = rb_enc_to_index(enc);
9347 regex_t *reg_grapheme_cluster = NULL;
9348 static regex_t *reg_grapheme_cluster_utf8 = NULL;
9349
9350 /* synchronize */
9351 if (encidx == rb_utf8_encindex() && reg_grapheme_cluster_utf8) {
9352 reg_grapheme_cluster = reg_grapheme_cluster_utf8;
9353 }
9354 if (!reg_grapheme_cluster) {
9355 const OnigUChar source_ascii[] = "\\X";
9356 OnigErrorInfo einfo;
9357 const OnigUChar *source = source_ascii;
9358 size_t source_len = sizeof(source_ascii) - 1;
9359 switch (encidx) {
9360#define CHARS_16BE(x) (OnigUChar)((x)>>8), (OnigUChar)(x)
9361#define CHARS_16LE(x) (OnigUChar)(x), (OnigUChar)((x)>>8)
9362#define CHARS_32BE(x) CHARS_16BE((x)>>16), CHARS_16BE(x)
9363#define CHARS_32LE(x) CHARS_16LE(x), CHARS_16LE((x)>>16)
9364#define CASE_UTF(e) \
9365 case ENCINDEX_UTF_##e: { \
9366 static const OnigUChar source_UTF_##e[] = {CHARS_##e('\\'), CHARS_##e('X')}; \
9367 source = source_UTF_##e; \
9368 source_len = sizeof(source_UTF_##e); \
9369 break; \
9370 }
9371 CASE_UTF(16BE); CASE_UTF(16LE); CASE_UTF(32BE); CASE_UTF(32LE);
9372#undef CASE_UTF
9373#undef CHARS_16BE
9374#undef CHARS_16LE
9375#undef CHARS_32BE
9376#undef CHARS_32LE
9377 }
9378 int r = onig_new(&reg_grapheme_cluster, source, source + source_len,
9379 ONIG_OPTION_DEFAULT, enc, OnigDefaultSyntax, &einfo);
9380 if (r) {
9381 UChar message[ONIG_MAX_ERROR_MESSAGE_LEN];
9382 onig_error_code_to_str(message, r, &einfo);
9383 rb_fatal("cannot compile grapheme cluster regexp: %s", (char *)message);
9384 }
9385 if (encidx == rb_utf8_encindex()) {
9386 reg_grapheme_cluster_utf8 = reg_grapheme_cluster;
9387 }
9388 }
9389 return reg_grapheme_cluster;
9390}
9391
9392static VALUE
9393rb_str_each_grapheme_cluster_size(VALUE str, VALUE args, VALUE eobj)
9394{
9395 size_t grapheme_cluster_count = 0;
9396 regex_t *reg_grapheme_cluster = NULL;
9397 rb_encoding *enc = get_encoding(str);
9398 const char *ptr, *end;
9399
9400 if (!rb_enc_unicode_p(enc)) {
9401 return rb_str_length(str);
9402 }
9403
9404 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9405 ptr = RSTRING_PTR(str);
9406 end = RSTRING_END(str);
9407
9408 while (ptr < end) {
9409 OnigPosition len = onig_match(reg_grapheme_cluster,
9410 (const OnigUChar *)ptr, (const OnigUChar *)end,
9411 (const OnigUChar *)ptr, NULL, 0);
9412 if (len <= 0) break;
9413 grapheme_cluster_count++;
9414 ptr += len;
9415 }
9416
9417 return SIZET2NUM(grapheme_cluster_count);
9418}
9419
9420static VALUE
9421rb_str_enumerate_grapheme_clusters(VALUE str, VALUE ary)
9422{
9423 VALUE orig = str;
9424 regex_t *reg_grapheme_cluster = NULL;
9425 rb_encoding *enc = get_encoding(str);
9426 const char *ptr0, *ptr, *end;
9427
9428 if (!rb_enc_unicode_p(enc)) {
9429 return rb_str_enumerate_chars(str, ary);
9430 }
9431
9432 if (!ary) str = rb_str_new_frozen(str);
9433 reg_grapheme_cluster = get_reg_grapheme_cluster(enc);
9434 ptr0 = ptr = RSTRING_PTR(str);
9435 end = RSTRING_END(str);
9436
9437 while (ptr < end) {
9438 OnigPosition len = onig_match(reg_grapheme_cluster,
9439 (const OnigUChar *)ptr, (const OnigUChar *)end,
9440 (const OnigUChar *)ptr, NULL, 0);
9441 if (len <= 0) break;
9442 ENUM_ELEM(ary, rb_str_subseq(str, ptr-ptr0, len));
9443 ptr += len;
9444 }
9445 RB_GC_GUARD(str);
9446 if (ary)
9447 return ary;
9448 else
9449 return orig;
9450}
9451
9452/*
9453 * call-seq:
9454 * each_grapheme_cluster {|gc| ... } -> self
9455 * each_grapheme_cluster -> enumerator
9456 *
9457 * :include: doc/string/each_grapheme_cluster.rdoc
9458 *
9459 */
9460
9461static VALUE
9462rb_str_each_grapheme_cluster(VALUE str)
9463{
9464 RETURN_SIZED_ENUMERATOR(str, 0, 0, rb_str_each_grapheme_cluster_size);
9465 return rb_str_enumerate_grapheme_clusters(str, 0);
9466}
9467
9468/*
9469 * call-seq:
9470 * grapheme_clusters -> array_of_grapheme_clusters
9471 *
9472 * :include: doc/string/grapheme_clusters.rdoc
9473 *
9474 */
9475
9476static VALUE
9477rb_str_grapheme_clusters(VALUE str)
9478{
9479 VALUE ary = WANTARRAY("grapheme_clusters", rb_str_strlen(str));
9480 return rb_str_enumerate_grapheme_clusters(str, ary);
9481}
9482
9483static long
9484chopped_length(VALUE str)
9485{
9486 rb_encoding *enc = STR_ENC_GET(str);
9487 const char *p, *p2, *beg, *end;
9488
9489 beg = RSTRING_PTR(str);
9490 end = beg + RSTRING_LEN(str);
9491 if (beg >= end) return 0;
9492 p = rb_enc_prev_char(beg, end, end, enc);
9493 if (!p) return 0;
9494 if (p > beg && rb_enc_ascget(p, end, 0, enc) == '\n') {
9495 p2 = rb_enc_prev_char(beg, p, end, enc);
9496 if (p2 && rb_enc_ascget(p2, end, 0, enc) == '\r') p = p2;
9497 }
9498 return p - beg;
9499}
9500
9501/*
9502 * call-seq:
9503 * chop! -> self or nil
9504 *
9505 * Like String#chop, but modifies +self+ in place;
9506 * returns +nil+ if +self+ is empty, +self+ otherwise.
9507 *
9508 * Related: String#chomp!.
9509 */
9510
9511static VALUE
9512rb_str_chop_bang(VALUE str)
9513{
9514 str_modify_keep_cr(str);
9515 if (RSTRING_LEN(str) > 0) {
9516 long len;
9517 len = chopped_length(str);
9518 STR_SET_LEN(str, len);
9519 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9520 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9522 }
9523 return str;
9524 }
9525 return Qnil;
9526}
9527
9528
9529/*
9530 * call-seq:
9531 * chop -> new_string
9532 *
9533 * :include: doc/string/chop.rdoc
9534 *
9535 */
9536
9537static VALUE
9538rb_str_chop(VALUE str)
9539{
9540 return rb_str_subseq(str, 0, chopped_length(str));
9541}
9542
9543static long
9544smart_chomp(VALUE str, const char *e, const char *p)
9545{
9546 rb_encoding *enc = rb_enc_get(str);
9547 if (rb_enc_mbminlen(enc) > 1) {
9548 const char *pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9549 if (rb_enc_is_newline(pp, e, enc)) {
9550 e = pp;
9551 }
9552 pp = e - rb_enc_mbminlen(enc);
9553 if (pp >= p) {
9554 pp = rb_enc_left_char_head(p, pp, e, enc);
9555 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9556 e = pp;
9557 }
9558 }
9559 }
9560 else {
9561 switch (*(e-1)) { /* not e[-1] to get rid of VC bug */
9562 case '\n':
9563 if (--e > p && *(e-1) == '\r') {
9564 --e;
9565 }
9566 break;
9567 case '\r':
9568 --e;
9569 break;
9570 }
9571 }
9572 return e - p;
9573}
9574
9575static long
9576chompped_length(VALUE str, VALUE rs)
9577{
9578 rb_encoding *enc;
9579 int newline;
9580 char *pp, *e, *rsptr;
9581 long rslen;
9582 char *const p = RSTRING_PTR(str);
9583 long len = RSTRING_LEN(str);
9584
9585 if (len == 0) return 0;
9586 e = p + len;
9587 if (rs == rb_default_rs) {
9588 return smart_chomp(str, e, p);
9589 }
9590
9591 enc = rb_enc_get(str);
9592 RSTRING_GETMEM(rs, rsptr, rslen);
9593 if (rslen == 0) {
9594 if (rb_enc_mbminlen(enc) > 1) {
9595 while (e > p) {
9596 pp = rb_enc_left_char_head(p, e-rb_enc_mbminlen(enc), e, enc);
9597 if (!rb_enc_is_newline(pp, e, enc)) break;
9598 e = pp;
9599 pp -= rb_enc_mbminlen(enc);
9600 if (pp >= p) {
9601 pp = rb_enc_left_char_head(p, pp, e, enc);
9602 if (rb_enc_ascget(pp, e, 0, enc) == '\r') {
9603 e = pp;
9604 }
9605 }
9606 }
9607 }
9608 else {
9609 while (e > p && *(e-1) == '\n') {
9610 --e;
9611 if (e > p && *(e-1) == '\r')
9612 --e;
9613 }
9614 }
9615 return e - p;
9616 }
9617 if (rslen > len) return len;
9618
9619 enc = rb_enc_get(rs);
9620 newline = rsptr[rslen-1];
9621 if (rslen == rb_enc_mbminlen(enc)) {
9622 if (rslen == 1) {
9623 if (newline == '\n')
9624 return smart_chomp(str, e, p);
9625 }
9626 else {
9627 if (rb_enc_is_newline(rsptr, rsptr+rslen, enc))
9628 return smart_chomp(str, e, p);
9629 }
9630 }
9631
9632 enc = rb_enc_check(str, rs);
9633 if (is_broken_string(rs)) {
9634 return len;
9635 }
9636 pp = e - rslen;
9637 if (p[len-1] == newline &&
9638 (rslen <= 1 ||
9639 memcmp(rsptr, pp, rslen) == 0)) {
9640 if (at_char_boundary(p, pp, e, enc))
9641 return len - rslen;
9642 RB_GC_GUARD(rs);
9643 }
9644 return len;
9645}
9646
9652static VALUE
9653chomp_rs(int argc, const VALUE *argv)
9654{
9655 rb_check_arity(argc, 0, 1);
9656 if (argc > 0) {
9657 VALUE rs = argv[0];
9658 if (!NIL_P(rs)) StringValue(rs);
9659 return rs;
9660 }
9661 else {
9662 return rb_rs;
9663 }
9664}
9665
9666VALUE
9667rb_str_chomp_string(VALUE str, VALUE rs)
9668{
9669 long olen = RSTRING_LEN(str);
9670 long len = chompped_length(str, rs);
9671 if (len >= olen) return Qnil;
9672 str_modify_keep_cr(str);
9673 STR_SET_LEN(str, len);
9674 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
9675 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
9677 }
9678 return str;
9679}
9680
9681/*
9682 * call-seq:
9683 * chomp!(line_sep = $/) -> self or nil
9684 *
9685 * Like String#chomp, but modifies +self+ in place;
9686 * returns +nil+ if no modification made, +self+ otherwise.
9687 *
9688 */
9689
9690static VALUE
9691rb_str_chomp_bang(int argc, VALUE *argv, VALUE str)
9692{
9693 VALUE rs;
9694 str_modifiable(str);
9695 if (RSTRING_LEN(str) == 0 && argc < 2) return Qnil;
9696 rs = chomp_rs(argc, argv);
9697 if (NIL_P(rs)) return Qnil;
9698 return rb_str_chomp_string(str, rs);
9699}
9700
9701
9702/*
9703 * call-seq:
9704 * chomp(line_sep = $/) -> new_string
9705 *
9706 * :include: doc/string/chomp.rdoc
9707 *
9708 */
9709
9710static VALUE
9711rb_str_chomp(int argc, VALUE *argv, VALUE str)
9712{
9713 VALUE rs = chomp_rs(argc, argv);
9714 if (NIL_P(rs)) return str_duplicate(rb_cString, str);
9715 return rb_str_subseq(str, 0, chompped_length(str, rs));
9716}
9717
9718static long
9719lstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9720{
9721 const char *const start = s;
9722
9723 if (!s || s >= e) return 0;
9724
9725 /* remove spaces at head */
9726 if (single_byte_optimizable(str)) {
9727 while (s < e && (*s == '\0' || ascii_isspace(*s))) s++;
9728 }
9729 else {
9730 while (s < e) {
9731 int n;
9732 unsigned int cc = rb_enc_codepoint_len(s, e, &n, enc);
9733
9734 if (cc && !rb_isspace(cc)) break;
9735 s += n;
9736 }
9737 }
9738 return s - start;
9739}
9740
/*
 *  call-seq:
 *    lstrip! -> self or nil
 *
 *  Like String#lstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#rstrip!, String#strip!.
 */
9750
9751static VALUE
9752rb_str_lstrip_bang(VALUE str)
9753{
9754 rb_encoding *enc;
9755 char *start, *s;
9756 long olen, loffset;
9757
9758 str_modify_keep_cr(str);
9759 enc = STR_ENC_GET(str);
9760 RSTRING_GETMEM(str, start, olen);
9761 loffset = lstrip_offset(str, start, start+olen, enc);
9762 if (loffset > 0) {
9763 long len = olen-loffset;
9764 s = start + loffset;
9765 memmove(start, s, len);
9766 STR_SET_LEN(str, len);
9767 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9768 return str;
9769 }
9770 return Qnil;
9771}
9772
9773
9774/*
9775 * call-seq:
9776 * lstrip -> new_string
9777 *
9778 * Returns a copy of +self+ with leading whitespace removed;
9779 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9780 *
9781 * whitespace = "\x00\t\n\v\f\r "
9782 * s = whitespace + 'abc' + whitespace
9783 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9784 * s.lstrip # => "abc\u0000\t\n\v\f\r "
9785 *
9786 * Related: String#rstrip, String#strip.
9787 */
9788
9789static VALUE
9790rb_str_lstrip(VALUE str)
9791{
9792 char *start;
9793 long len, loffset;
9794 RSTRING_GETMEM(str, start, len);
9795 loffset = lstrip_offset(str, start, start+len, STR_ENC_GET(str));
9796 if (loffset <= 0) return str_duplicate(rb_cString, str);
9797 return rb_str_subseq(str, loffset, len - loffset);
9798}
9799
9800static long
9801rstrip_offset(VALUE str, const char *s, const char *e, rb_encoding *enc)
9802{
9803 const char *t;
9804
9805 rb_str_check_dummy_enc(enc);
9807 rb_raise(rb_eEncCompatError, "invalid byte sequence in %s", rb_enc_name(enc));
9808 }
9809 if (!s || s >= e) return 0;
9810 t = e;
9811
9812 /* remove trailing spaces or '\0's */
9813 if (single_byte_optimizable(str)) {
9814 unsigned char c;
9815 while (s < t && ((c = *(t-1)) == '\0' || ascii_isspace(c))) t--;
9816 }
9817 else {
9818 char *tp;
9819
9820 while ((tp = rb_enc_prev_char(s, t, e, enc)) != NULL) {
9821 unsigned int c = rb_enc_codepoint(tp, e, enc);
9822 if (c && !rb_isspace(c)) break;
9823 t = tp;
9824 }
9825 }
9826 return e - t;
9827}
9828
/*
 *  call-seq:
 *    rstrip! -> self or nil
 *
 *  Like String#rstrip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#strip!.
 */
9838
9839static VALUE
9840rb_str_rstrip_bang(VALUE str)
9841{
9842 rb_encoding *enc;
9843 char *start;
9844 long olen, roffset;
9845
9846 str_modify_keep_cr(str);
9847 enc = STR_ENC_GET(str);
9848 RSTRING_GETMEM(str, start, olen);
9849 roffset = rstrip_offset(str, start, start+olen, enc);
9850 if (roffset > 0) {
9851 long len = olen - roffset;
9852
9853 STR_SET_LEN(str, len);
9854 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9855 return str;
9856 }
9857 return Qnil;
9858}
9859
9860
9861/*
9862 * call-seq:
9863 * rstrip -> new_string
9864 *
9865 * Returns a copy of the receiver with trailing whitespace removed;
9866 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9867 *
9868 * whitespace = "\x00\t\n\v\f\r "
9869 * s = whitespace + 'abc' + whitespace
9870 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9871 * s.rstrip # => "\u0000\t\n\v\f\r abc"
9872 *
9873 * Related: String#lstrip, String#strip.
9874 */
9875
9876static VALUE
9877rb_str_rstrip(VALUE str)
9878{
9879 rb_encoding *enc;
9880 char *start;
9881 long olen, roffset;
9882
9883 enc = STR_ENC_GET(str);
9884 RSTRING_GETMEM(str, start, olen);
9885 roffset = rstrip_offset(str, start, start+olen, enc);
9886
9887 if (roffset <= 0) return str_duplicate(rb_cString, str);
9888 return rb_str_subseq(str, 0, olen-roffset);
9889}
9890
9891
/*
 *  call-seq:
 *    strip! -> self or nil
 *
 *  Like String#strip, except that any modifications are made in +self+;
 *  returns +self+ if any modifications are made, +nil+ otherwise.
 *
 *  Related: String#lstrip!, String#rstrip!.
 */
9901
9902static VALUE
9903rb_str_strip_bang(VALUE str)
9904{
9905 char *start;
9906 long olen, loffset, roffset;
9907 rb_encoding *enc;
9908
9909 str_modify_keep_cr(str);
9910 enc = STR_ENC_GET(str);
9911 RSTRING_GETMEM(str, start, olen);
9912 loffset = lstrip_offset(str, start, start+olen, enc);
9913 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9914
9915 if (loffset > 0 || roffset > 0) {
9916 long len = olen-roffset;
9917 if (loffset > 0) {
9918 len -= loffset;
9919 memmove(start, start + loffset, len);
9920 }
9921 STR_SET_LEN(str, len);
9922 TERM_FILL(start+len, rb_enc_mbminlen(enc));
9923 return str;
9924 }
9925 return Qnil;
9926}
9927
9928
9929/*
9930 * call-seq:
9931 * strip -> new_string
9932 *
9933 * Returns a copy of the receiver with leading and trailing whitespace removed;
9934 * see {Whitespace in Strings}[rdoc-ref:String@Whitespace+in+Strings]:
9935 *
9936 * whitespace = "\x00\t\n\v\f\r "
9937 * s = whitespace + 'abc' + whitespace
9938 * s # => "\u0000\t\n\v\f\r abc\u0000\t\n\v\f\r "
9939 * s.strip # => "abc"
9940 *
9941 * Related: String#lstrip, String#rstrip.
9942 */
9943
9944static VALUE
9945rb_str_strip(VALUE str)
9946{
9947 char *start;
9948 long olen, loffset, roffset;
9949 rb_encoding *enc = STR_ENC_GET(str);
9950
9951 RSTRING_GETMEM(str, start, olen);
9952 loffset = lstrip_offset(str, start, start+olen, enc);
9953 roffset = rstrip_offset(str, start+loffset, start+olen, enc);
9954
9955 if (loffset <= 0 && roffset <= 0) return str_duplicate(rb_cString, str);
9956 return rb_str_subseq(str, loffset, olen-loffset-roffset);
9957}
9958
9959static VALUE
9960scan_once(VALUE str, VALUE pat, long *start, int set_backref_str)
9961{
9962 VALUE result = Qnil;
9963 long end, pos = rb_pat_search(pat, str, *start, set_backref_str);
9964 if (pos >= 0) {
9965 VALUE match;
9966 struct re_registers *regs;
9967 if (BUILTIN_TYPE(pat) == T_STRING) {
9968 regs = NULL;
9969 end = pos + RSTRING_LEN(pat);
9970 }
9971 else {
9972 match = rb_backref_get();
9973 regs = RMATCH_REGS(match);
9974 pos = BEG(0);
9975 end = END(0);
9976 }
9977
9978 if (pos == end) {
9979 rb_encoding *enc = STR_ENC_GET(str);
9980 /*
9981 * Always consume at least one character of the input string
9982 */
9983 if (RSTRING_LEN(str) > end)
9984 *start = end + rb_enc_fast_mbclen(RSTRING_PTR(str) + end,
9985 RSTRING_END(str), enc);
9986 else
9987 *start = end + 1;
9988 }
9989 else {
9990 *start = end;
9991 }
9992
9993 if (!regs || regs->num_regs == 1) {
9994 result = rb_str_subseq(str, pos, end - pos);
9995 return result;
9996 }
9997 else {
9998 result = rb_ary_new2(regs->num_regs);
9999 for (int i = 1; i < regs->num_regs; i++) {
10000 VALUE s = Qnil;
10001 if (BEG(i) >= 0) {
10002 s = rb_str_subseq(str, BEG(i), END(i)-BEG(i));
10003 }
10004
10005 rb_ary_push(result, s);
10006 }
10007 }
10008
10009 RB_GC_GUARD(match);
10010 }
10011
10012 return result;
10013}
10014
10015
10016/*
10017 * call-seq:
10018 * scan(string_or_regexp) -> array
10019 * scan(string_or_regexp) {|matches| ... } -> self
10020 *
10021 * Matches a pattern against +self+; the pattern is:
10022 *
10023 * - +string_or_regexp+ itself, if it is a Regexp.
10024 * - <tt>Regexp.quote(string_or_regexp)</tt>, if +string_or_regexp+ is a string.
10025 *
10026 * Iterates through +self+, generating a collection of matching results:
10027 *
10028 * - If the pattern contains no groups, each result is the
10029 * matched string, <code>$&</code>.
10030 * - If the pattern contains groups, each result is an array
10031 * containing one entry per group.
10032 *
10033 * With no block given, returns an array of the results:
10034 *
10035 * s = 'cruel world'
10036 * s.scan(/\w+/) # => ["cruel", "world"]
10037 * s.scan(/.../) # => ["cru", "el ", "wor"]
10038 * s.scan(/(...)/) # => [["cru"], ["el "], ["wor"]]
10039 * s.scan(/(..)(..)/) # => [["cr", "ue"], ["l ", "wo"]]
10040 *
10041 * With a block given, calls the block with each result; returns +self+:
10042 *
10043 * s.scan(/\w+/) {|w| print "<<#{w}>> " }
10044 * print "\n"
10045 * s.scan(/(.)(.)/) {|x,y| print y, x }
10046 * print "\n"
10047 *
10048 * Output:
10049 *
10050 * <<cruel>> <<world>>
10051 * rceu lowlr
10052 *
10053 */
10054
10055static VALUE
10056rb_str_scan(VALUE str, VALUE pat)
10057{
10058 VALUE result;
10059 long start = 0;
10060 long last = -1, prev = 0;
10061 char *p = RSTRING_PTR(str); long len = RSTRING_LEN(str);
10062
10063 pat = get_pat_quoted(pat, 1);
10064 mustnot_broken(str);
10065 if (!rb_block_given_p()) {
10066 VALUE ary = rb_ary_new();
10067
10068 while (!NIL_P(result = scan_once(str, pat, &start, 0))) {
10069 last = prev;
10070 prev = start;
10071 rb_ary_push(ary, result);
10072 }
10073 if (last >= 0) rb_pat_search(pat, str, last, 1);
10074 else rb_backref_set(Qnil);
10075 return ary;
10076 }
10077
10078 while (!NIL_P(result = scan_once(str, pat, &start, 1))) {
10079 last = prev;
10080 prev = start;
10081 rb_yield(result);
10082 str_mod_check(str, p, len);
10083 }
10084 if (last >= 0) rb_pat_search(pat, str, last, 1);
10085 return str;
10086}
10087
10088
10089/*
10090 * call-seq:
10091 * hex -> integer
10092 *
10093 * Interprets the leading substring of +self+ as a string of hexadecimal digits
10094 * (with an optional sign and an optional <code>0x</code>) and returns the
10095 * corresponding number;
10096 * returns zero if there is no such leading substring:
10097 *
10098 * '0x0a'.hex # => 10
10099 * '-1234'.hex # => -4660
10100 * '0'.hex # => 0
10101 * 'non-numeric'.hex # => 0
10102 *
10103 * Related: String#oct.
10104 *
10105 */
10106
10107static VALUE
10108rb_str_hex(VALUE str)
10109{
10110 return rb_str_to_inum(str, 16, FALSE);
10111}
10112
10113
10114/*
10115 * call-seq:
10116 * oct -> integer
10117 *
10118 * Interprets the leading substring of +self+ as a string of octal digits
10119 * (with an optional sign) and returns the corresponding number;
10120 * returns zero if there is no such leading substring:
10121 *
10122 * '123'.oct # => 83
10123 * '-377'.oct # => -255
10124 * '0377non-numeric'.oct # => 255
10125 * 'non-numeric'.oct # => 0
10126 *
10127 * If +self+ starts with <tt>0</tt>, radix indicators are honored;
10128 * see Kernel#Integer.
10129 *
10130 * Related: String#hex.
10131 *
10132 */
10133
10134static VALUE
10135rb_str_oct(VALUE str)
10136{
10137 return rb_str_to_inum(str, -8, FALSE);
10138}
10139
#ifndef HAVE_CRYPT_R
/*
 * Only built when HAVE_CRYPT_R is not defined: rb_str_crypt then calls
 * plain crypt(3) and takes this lock around the call.
 */
# include "ruby/thread_native.h"
# include "ruby/atomic.h"

/* Lock taken by rb_str_crypt around crypt(3); statically initialized. */
static struct {
    rb_nativethread_lock_t lock;
} crypt_mutex = {PTHREAD_MUTEX_INITIALIZER};

/* Intentionally empty: the lock above needs no run-time initialization. */
static void
crypt_mutex_initialize(void)
{
}
#endif
10153
10154/*
10155 * call-seq:
10156 * crypt(salt_str) -> new_string
10157 *
10158 * Returns the string generated by calling <code>crypt(3)</code>
10159 * standard library function with <code>str</code> and
10160 * <code>salt_str</code>, in this order, as its arguments. Please do
10161 * not use this method any longer. It is legacy; provided only for
10162 * backward compatibility with ruby scripts in earlier days. It is
10163 * bad to use in contemporary programs for several reasons:
10164 *
10165 * * Behaviour of C's <code>crypt(3)</code> depends on the OS it is
10166 * run. The generated string lacks data portability.
10167 *
10168 * * On some OSes such as Mac OS, <code>crypt(3)</code> never fails
10169 * (i.e. silently ends up in unexpected results).
10170 *
10171 * * On some OSes such as Mac OS, <code>crypt(3)</code> is not
10172 * thread safe.
10173 *
10174 * * So-called "traditional" usage of <code>crypt(3)</code> is very
10175 * very very weak. According to its manpage, Linux's traditional
10176 * <code>crypt(3)</code> output has only 2**56 variations; too
10177 * easy to brute force today. And this is the default behaviour.
10178 *
10179 * * In order to make things robust some OSes implement so-called
10180 * "modular" usage. To go through, you have to do a complex
10181 * build-up of the <code>salt_str</code> parameter, by hand.
10182 * Failure in generation of a proper salt string tends not to
10183 * yield any errors; typos in parameters are normally not
10184 * detectable.
10185 *
10186 * * For instance, in the following example, the second invocation
10187 * of String#crypt is wrong; it has a typo in "round=" (lacks
10188 * "s"). However the call does not fail and something unexpected
10189 * is generated.
10190 *
10191 * "foo".crypt("$5$rounds=1000$salt$") # OK, proper usage
10192 * "foo".crypt("$5$round=1000$salt$") # Typo not detected
10193 *
10194 * * Even in the "modular" mode, some hash functions are considered
10195 * archaic and no longer recommended at all; for instance module
10196 * <code>$1$</code> is officially abandoned by its author: see
10197 * http://phk.freebsd.dk/sagas/md5crypt_eol/ . For another
10198 * instance module <code>$3$</code> is considered completely
10199 * broken: see the manpage of FreeBSD.
10200 *
10201 * * On some OS such as Mac OS, there is no modular mode. Yet, as
10202 * written above, <code>crypt(3)</code> on Mac OS never fails.
10203 * This means even if you build up a proper salt string it
10204 * generates a traditional DES hash anyways, and there is no way
10205 * for you to be aware of.
10206 *
10207 * "foo".crypt("$5$rounds=1000$salt$") # => "$5fNPQMxC5j6."
10208 *
10209 * If for some reason you cannot migrate to other secure contemporary
10210 * password hashing algorithms, install the string-crypt gem and
10211 * <code>require 'string/crypt'</code> to continue using it.
10212 */
10213
static VALUE
rb_str_crypt(VALUE str, VALUE salt)
{
    /*
     * Two build variants share this body:
     *  - HAVE_CRYPT_R: crypt_r(3) with a per-call crypt_data buffer
     *    allocated through ALLOCV;
     *  - otherwise: crypt(3) called while holding crypt_mutex.lock.
     * CRYPT_END() releases whichever resource the variant acquired.
     */
#ifdef HAVE_CRYPT_R
    VALUE databuf;
    struct crypt_data *data;
# define CRYPT_END() ALLOCV_END(databuf)
#else
    extern char *crypt(const char *, const char *);
# define CRYPT_END() rb_nativethread_lock_unlock(&crypt_mutex.lock)
#endif
    VALUE result;
    const char *s, *saltp;
    char *res;
#ifdef BROKEN_CRYPT
    char salt_8bit_clean[3];
#endif

    StringValue(salt);
    mustnot_wchar(str);
    mustnot_wchar(salt);
    s = StringValueCStr(str);
    saltp = RSTRING_PTR(salt);
    /* The salt must provide at least two non-NUL leading bytes. */
    if (RSTRING_LEN(salt) < 2 || !saltp[0] || !saltp[1]) {
        rb_raise(rb_eArgError, "salt too short (need >=2 bytes)");
    }

#ifdef BROKEN_CRYPT
    /* Mask the first two salt bytes down to 7 bits when either is non-ASCII. */
    if (!ISASCII((unsigned char)saltp[0]) || !ISASCII((unsigned char)saltp[1])) {
        salt_8bit_clean[0] = saltp[0] & 0x7f;
        salt_8bit_clean[1] = saltp[1] & 0x7f;
        salt_8bit_clean[2] = '\0';
        saltp = salt_8bit_clean;
    }
#endif
#ifdef HAVE_CRYPT_R
    data = ALLOCV(databuf, sizeof(struct crypt_data));
# ifdef HAVE_STRUCT_CRYPT_DATA_INITIALIZED
    data->initialized = 0;
# endif
    res = crypt_r(s, saltp, data);
#else
    crypt_mutex_initialize();
    rb_nativethread_lock_lock(&crypt_mutex.lock);
    res = crypt(s, saltp);
#endif
    if (!res) {
        /* Save errno first: CRYPT_END() may clobber it before we raise. */
        int err = errno;
        CRYPT_END();
        rb_syserr_fail(err, "crypt");
    }
    /* Copy the result into a Ruby String before releasing the buffer/lock. */
    result = rb_str_new_cstr(res);
    CRYPT_END();
    return result;
}
10269
10270
10271/*
10272 * call-seq:
10273 * ord -> integer
10274 *
10275 * :include: doc/string/ord.rdoc
10276 *
10277 */
10278
10279static VALUE
10280rb_str_ord(VALUE s)
10281{
10282 unsigned int c;
10283
10284 c = rb_enc_codepoint(RSTRING_PTR(s), RSTRING_END(s), STR_ENC_GET(s));
10285 return UINT2NUM(c);
10286}
10287/*
10288 * call-seq:
10289 * sum(n = 16) -> integer
10290 *
10291 * :include: doc/string/sum.rdoc
10292 *
10293 */
10294
10295static VALUE
10296rb_str_sum(int argc, VALUE *argv, VALUE str)
10297{
10298 int bits = 16;
10299 char *ptr, *p, *pend;
10300 long len;
10301 VALUE sum = INT2FIX(0);
10302 unsigned long sum0 = 0;
10303
10304 if (rb_check_arity(argc, 0, 1) && (bits = NUM2INT(argv[0])) < 0) {
10305 bits = 0;
10306 }
10307 ptr = p = RSTRING_PTR(str);
10308 len = RSTRING_LEN(str);
10309 pend = p + len;
10310
10311 while (p < pend) {
10312 if (FIXNUM_MAX - UCHAR_MAX < sum0) {
10313 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10314 str_mod_check(str, ptr, len);
10315 sum0 = 0;
10316 }
10317 sum0 += (unsigned char)*p;
10318 p++;
10319 }
10320
10321 if (bits == 0) {
10322 if (sum0) {
10323 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10324 }
10325 }
10326 else {
10327 if (sum == INT2FIX(0)) {
10328 if (bits < (int)sizeof(long)*CHAR_BIT) {
10329 sum0 &= (((unsigned long)1)<<bits)-1;
10330 }
10331 sum = LONG2FIX(sum0);
10332 }
10333 else {
10334 VALUE mod;
10335
10336 if (sum0) {
10337 sum = rb_funcall(sum, '+', 1, LONG2FIX(sum0));
10338 }
10339
10340 mod = rb_funcall(INT2FIX(1), idLTLT, 1, INT2FIX(bits));
10341 mod = rb_funcall(mod, '-', 1, INT2FIX(1));
10342 sum = rb_funcall(sum, '&', 1, mod);
10343 }
10344 }
10345 return sum;
10346}
10347
/*
 * Shared implementation behind rb_str_ljust, rb_str_rjust and rb_str_center.
 *
 * argv[0] is the target width; the optional argv[1] is the pad string
 * (a single space when omitted).  +jflag+ selects where the padding goes:
 * 'l' puts all of it after the text, 'r' puts all of it before the text,
 * and any other value splits it with the smaller half in front.
 *
 * Returns a duplicate of +str+ when the width is negative or not larger
 * than the current length; raises ArgumentError for an empty pad string or
 * when the resulting size would overflow a long.
 */
static VALUE
rb_str_justify(int argc, VALUE *argv, VALUE str, char jflag)
{
    rb_encoding *enc;
    VALUE w;
    long width, len, flen = 1, fclen = 1;   /* flen: pad bytes; fclen: str_strlen of pad */
    VALUE res;
    char *p;
    const char *f = " ";
    long n, size, llen, rlen, llen2 = 0, rlen2 = 0;
    VALUE pad;
    int singlebyte = 1, cr;
    int termlen;

    rb_scan_args(argc, argv, "11", &w, &pad);
    enc = STR_ENC_GET(str);
    termlen = rb_enc_mbminlen(enc);
    width = NUM2LONG(w);
    if (argc == 2) {
        StringValue(pad);
        enc = rb_enc_check(str, pad);
        f = RSTRING_PTR(pad);
        flen = RSTRING_LEN(pad);
        fclen = str_strlen(pad, enc); /* rb_enc_check */
        singlebyte = single_byte_optimizable(pad);
        if (flen == 0 || fclen == 0) {
            rb_raise(rb_eArgError, "zero width padding");
        }
    }
    len = str_strlen(str, enc); /* rb_enc_check */
    if (width < 0 || len >= width) return str_duplicate(rb_cString, str);
    /* n: total amount of padding, split into left (llen) and right (rlen) */
    n = width - len;
    llen = (jflag == 'l') ? 0 : ((jflag == 'r') ? n : n/2);
    rlen = n - llen;
    cr = ENC_CODERANGE(str);
    if (flen > 1) {
        /* byte length of the partial pad copy that finishes each side */
        llen2 = str_offset(f, f + flen, llen % fclen, enc, singlebyte);
        rlen2 = str_offset(f, f + flen, rlen % fclen, enc, singlebyte);
    }
    size = RSTRING_LEN(str);
    /* Compute the byte size of all padding in `len`, checking each step
     * against LONG_MAX before it can overflow. */
    if ((len = llen / fclen + rlen / fclen) >= LONG_MAX / flen ||
        (len *= flen) >= LONG_MAX - llen2 - rlen2 ||
        (len += llen2 + rlen2) >= LONG_MAX - size) {
        rb_raise(rb_eArgError, "argument too big");
    }
    len += size;
    res = str_new0(rb_cString, 0, len, termlen);
    p = RSTRING_PTR(res);
    /* left padding */
    if (flen <= 1) {
        memset(p, *f, llen);
        p += llen;
    }
    else {
        /* whole copies of the pad string, then the partial remainder */
        while (llen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            llen -= fclen;
        }
        if (llen > 0) {
            memcpy(p, f, llen2);
            p += llen2;
        }
    }
    /* the original text */
    memcpy(p, RSTRING_PTR(str), size);
    p += size;
    /* right padding, same scheme as the left side */
    if (flen <= 1) {
        memset(p, *f, rlen);
        p += rlen;
    }
    else {
        while (rlen >= fclen) {
            memcpy(p,f,flen);
            p += flen;
            rlen -= fclen;
        }
        if (rlen > 0) {
            memcpy(p, f, rlen2);
            p += rlen2;
        }
    }
    TERM_FILL(p, termlen);
    STR_SET_LEN(res, p-RSTRING_PTR(res));
    rb_enc_associate(res, enc);
    /* combine the code ranges of text and pad; a BROKEN result is not cached */
    if (argc == 2)
        cr = ENC_CODERANGE_AND(cr, ENC_CODERANGE(pad));
    if (cr != ENC_CODERANGE_BROKEN)
        ENC_CODERANGE_SET(res, cr);

    /* `f` points into pad's buffer, so pad must stay alive until here */
    RB_GC_GUARD(pad);
    return res;
}
10439
10440
10441/*
10442 * call-seq:
10443 * ljust(size, pad_string = ' ') -> new_string
10444 *
10445 * :include: doc/string/ljust.rdoc
10446 *
10447 * Related: String#rjust, String#center.
10448 *
10449 */
10450
10451static VALUE
10452rb_str_ljust(int argc, VALUE *argv, VALUE str)
10453{
10454 return rb_str_justify(argc, argv, str, 'l');
10455}
10456
10457/*
10458 * call-seq:
10459 * rjust(size, pad_string = ' ') -> new_string
10460 *
10461 * :include: doc/string/rjust.rdoc
10462 *
10463 * Related: String#ljust, String#center.
10464 *
10465 */
10466
10467static VALUE
10468rb_str_rjust(int argc, VALUE *argv, VALUE str)
10469{
10470 return rb_str_justify(argc, argv, str, 'r');
10471}
10472
10473
10474/*
10475 * call-seq:
10476 * center(size, pad_string = ' ') -> new_string
10477 *
10478 * :include: doc/string/center.rdoc
10479 *
10480 * Related: String#ljust, String#rjust.
10481 *
10482 */
10483
10484static VALUE
10485rb_str_center(int argc, VALUE *argv, VALUE str)
10486{
10487 return rb_str_justify(argc, argv, str, 'c');
10488}
10489
10490/*
10491 * call-seq:
10492 * partition(string_or_regexp) -> [head, match, tail]
10493 *
10494 * :include: doc/string/partition.rdoc
10495 *
10496 */
10497
10498static VALUE
10499rb_str_partition(VALUE str, VALUE sep)
10500{
10501 long pos;
10502
10503 sep = get_pat_quoted(sep, 0);
10504 if (RB_TYPE_P(sep, T_REGEXP)) {
10505 if (rb_reg_search(sep, str, 0, 0) < 0) {
10506 goto failed;
10507 }
10508 VALUE match = rb_backref_get();
10509 struct re_registers *regs = RMATCH_REGS(match);
10510
10511 pos = BEG(0);
10512 sep = rb_str_subseq(str, pos, END(0) - pos);
10513 }
10514 else {
10515 pos = rb_str_index(str, sep, 0);
10516 if (pos < 0) goto failed;
10517 }
10518 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10519 sep,
10520 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10521 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10522
10523 failed:
10524 return rb_ary_new3(3, str_duplicate(rb_cString, str), str_new_empty_String(str), str_new_empty_String(str));
10525}
10526
10527/*
10528 * call-seq:
10529 * rpartition(sep) -> [head, match, tail]
10530 *
10531 * :include: doc/string/rpartition.rdoc
10532 *
10533 */
10534
10535static VALUE
10536rb_str_rpartition(VALUE str, VALUE sep)
10537{
10538 long pos = RSTRING_LEN(str);
10539
10540 sep = get_pat_quoted(sep, 0);
10541 if (RB_TYPE_P(sep, T_REGEXP)) {
10542 if (rb_reg_search(sep, str, pos, 1) < 0) {
10543 goto failed;
10544 }
10545 VALUE match = rb_backref_get();
10546 struct re_registers *regs = RMATCH_REGS(match);
10547
10548 pos = BEG(0);
10549 sep = rb_str_subseq(str, pos, END(0) - pos);
10550 }
10551 else {
10552 pos = rb_str_sublen(str, pos);
10553 pos = rb_str_rindex(str, sep, pos);
10554 if (pos < 0) {
10555 goto failed;
10556 }
10557 }
10558
10559 return rb_ary_new3(3, rb_str_subseq(str, 0, pos),
10560 sep,
10561 rb_str_subseq(str, pos+RSTRING_LEN(sep),
10562 RSTRING_LEN(str)-pos-RSTRING_LEN(sep)));
10563 failed:
10564 return rb_ary_new3(3, str_new_empty_String(str), str_new_empty_String(str), str_duplicate(rb_cString, str));
10565}
10566
10567/*
10568 * call-seq:
10569 * start_with?(*string_or_regexp) -> true or false
10570 *
10571 * :include: doc/string/start_with_p.rdoc
10572 *
10573 */
10574
10575static VALUE
10576rb_str_start_with(int argc, VALUE *argv, VALUE str)
10577{
10578 int i;
10579
10580 for (i=0; i<argc; i++) {
10581 VALUE tmp = argv[i];
10582 if (RB_TYPE_P(tmp, T_REGEXP)) {
10583 if (rb_reg_start_with_p(tmp, str))
10584 return Qtrue;
10585 }
10586 else {
10587 const char *p, *s, *e;
10588 long slen, tlen;
10589 rb_encoding *enc;
10590
10591 StringValue(tmp);
10592 enc = rb_enc_check(str, tmp);
10593 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10594 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10595 p = RSTRING_PTR(str);
10596 e = p + slen;
10597 s = p + tlen;
10598 if (!at_char_right_boundary(p, s, e, enc))
10599 continue;
10600 if (memcmp(p, RSTRING_PTR(tmp), tlen) == 0)
10601 return Qtrue;
10602 }
10603 }
10604 return Qfalse;
10605}
10606
10607/*
10608 * call-seq:
10609 * end_with?(*strings) -> true or false
10610 *
10611 * :include: doc/string/end_with_p.rdoc
10612 *
10613 */
10614
10615static VALUE
10616rb_str_end_with(int argc, VALUE *argv, VALUE str)
10617{
10618 int i;
10619
10620 for (i=0; i<argc; i++) {
10621 VALUE tmp = argv[i];
10622 const char *p, *s, *e;
10623 long slen, tlen;
10624 rb_encoding *enc;
10625
10626 StringValue(tmp);
10627 enc = rb_enc_check(str, tmp);
10628 if ((tlen = RSTRING_LEN(tmp)) == 0) return Qtrue;
10629 if ((slen = RSTRING_LEN(str)) < tlen) continue;
10630 p = RSTRING_PTR(str);
10631 e = p + slen;
10632 s = e - tlen;
10633 if (!at_char_boundary(p, s, e, enc))
10634 continue;
10635 if (memcmp(s, RSTRING_PTR(tmp), tlen) == 0)
10636 return Qtrue;
10637 }
10638 return Qfalse;
10639}
10640
10650static long
10651deleted_prefix_length(VALUE str, VALUE prefix)
10652{
10653 const char *strptr, *prefixptr;
10654 long olen, prefixlen;
10655 rb_encoding *enc = rb_enc_get(str);
10656
10657 StringValue(prefix);
10658
10659 if (!is_broken_string(prefix) ||
10660 !rb_enc_asciicompat(enc) ||
10661 !rb_enc_asciicompat(rb_enc_get(prefix))) {
10662 enc = rb_enc_check(str, prefix);
10663 }
10664
10665 /* return 0 if not start with prefix */
10666 prefixlen = RSTRING_LEN(prefix);
10667 if (prefixlen <= 0) return 0;
10668 olen = RSTRING_LEN(str);
10669 if (olen < prefixlen) return 0;
10670 strptr = RSTRING_PTR(str);
10671 prefixptr = RSTRING_PTR(prefix);
10672 if (memcmp(strptr, prefixptr, prefixlen) != 0) return 0;
10673 if (is_broken_string(prefix)) {
10674 if (!is_broken_string(str)) {
10675 /* prefix in a valid string cannot be broken */
10676 return 0;
10677 }
10678 const char *strend = strptr + olen;
10679 const char *after_prefix = strptr + prefixlen;
10680 if (!at_char_right_boundary(strptr, after_prefix, strend, enc)) {
10681 /* prefix does not end at char-boundary */
10682 return 0;
10683 }
10684 }
10685 /* prefix part in `str` also should be valid. */
10686
10687 return prefixlen;
10688}
10689
10690/*
10691 * call-seq:
10692 * delete_prefix!(prefix) -> self or nil
10693 *
10694 * Like String#delete_prefix, except that +self+ is modified in place.
10695 * Returns +self+ if the prefix is removed, +nil+ otherwise.
10696 *
10697 */
10698
10699static VALUE
10700rb_str_delete_prefix_bang(VALUE str, VALUE prefix)
10701{
10702 long prefixlen;
10703 str_modify_keep_cr(str);
10704
10705 prefixlen = deleted_prefix_length(str, prefix);
10706 if (prefixlen <= 0) return Qnil;
10707
10708 return rb_str_drop_bytes(str, prefixlen);
10709}
10710
10711/*
10712 * call-seq:
10713 * delete_prefix(prefix) -> new_string
10714 *
10715 * :include: doc/string/delete_prefix.rdoc
10716 *
10717 */
10718
10719static VALUE
10720rb_str_delete_prefix(VALUE str, VALUE prefix)
10721{
10722 long prefixlen;
10723
10724 prefixlen = deleted_prefix_length(str, prefix);
10725 if (prefixlen <= 0) return str_duplicate(rb_cString, str);
10726
10727 return rb_str_subseq(str, prefixlen, RSTRING_LEN(str) - prefixlen);
10728}
10729
10739static long
10740deleted_suffix_length(VALUE str, VALUE suffix)
10741{
10742 const char *strptr, *suffixptr;
10743 long olen, suffixlen;
10744 rb_encoding *enc;
10745
10746 StringValue(suffix);
10747 if (is_broken_string(suffix)) return 0;
10748 enc = rb_enc_check(str, suffix);
10749
10750 /* return 0 if not start with suffix */
10751 suffixlen = RSTRING_LEN(suffix);
10752 if (suffixlen <= 0) return 0;
10753 olen = RSTRING_LEN(str);
10754 if (olen < suffixlen) return 0;
10755 strptr = RSTRING_PTR(str);
10756 suffixptr = RSTRING_PTR(suffix);
10757 const char *strend = strptr + olen;
10758 const char *before_suffix = strend - suffixlen;
10759 if (memcmp(before_suffix, suffixptr, suffixlen) != 0) return 0;
10760 if (!at_char_boundary(strptr, before_suffix, strend, enc)) return 0;
10761
10762 return suffixlen;
10763}
10764
10765/*
10766 * call-seq:
10767 * delete_suffix!(suffix) -> self or nil
10768 *
10769 * Like String#delete_suffix, except that +self+ is modified in place.
10770 * Returns +self+ if the suffix is removed, +nil+ otherwise.
10771 *
10772 */
10773
10774static VALUE
10775rb_str_delete_suffix_bang(VALUE str, VALUE suffix)
10776{
10777 long olen, suffixlen, len;
10778 str_modifiable(str);
10779
10780 suffixlen = deleted_suffix_length(str, suffix);
10781 if (suffixlen <= 0) return Qnil;
10782
10783 olen = RSTRING_LEN(str);
10784 str_modify_keep_cr(str);
10785 len = olen - suffixlen;
10786 STR_SET_LEN(str, len);
10787 TERM_FILL(&RSTRING_PTR(str)[len], TERM_LEN(str));
10788 if (ENC_CODERANGE(str) != ENC_CODERANGE_7BIT) {
10790 }
10791 return str;
10792}
10793
10794/*
10795 * call-seq:
10796 * delete_suffix(suffix) -> new_string
10797 *
10798 * :include: doc/string/delete_suffix.rdoc
10799 *
10800 */
10801
10802static VALUE
10803rb_str_delete_suffix(VALUE str, VALUE suffix)
10804{
10805 long suffixlen;
10806
10807 suffixlen = deleted_suffix_length(str, suffix);
10808 if (suffixlen <= 0) return str_duplicate(rb_cString, str);
10809
10810 return rb_str_subseq(str, 0, RSTRING_LEN(str) - suffixlen);
10811}
10812
10813void
10814rb_str_setter(VALUE val, ID id, VALUE *var)
10815{
10816 if (!NIL_P(val) && !RB_TYPE_P(val, T_STRING)) {
10817 rb_raise(rb_eTypeError, "value of %"PRIsVALUE" must be String", rb_id2str(id));
10818 }
10819 *var = val;
10820}
10821
10822static void
10823rb_fs_setter(VALUE val, ID id, VALUE *var)
10824{
10825 val = rb_fs_check(val);
10826 if (!val) {
10827 rb_raise(rb_eTypeError,
10828 "value of %"PRIsVALUE" must be String or Regexp",
10829 rb_id2str(id));
10830 }
10831 if (!NIL_P(val)) {
10832 rb_warn_deprecated("`$;'", NULL);
10833 }
10834 *var = val;
10835}
10836
10837
10838/*
10839 * call-seq:
10840 * force_encoding(encoding) -> self
10841 *
10842 * :include: doc/string/force_encoding.rdoc
10843 *
10844 */
10845
10846static VALUE
10847rb_str_force_encoding(VALUE str, VALUE enc)
10848{
10849 str_modifiable(str);
10850
10851 rb_encoding *encoding = rb_to_encoding(enc);
10852 int idx = rb_enc_to_index(encoding);
10853
10854 // If the encoding is unchanged, we do nothing.
10855 if (ENCODING_GET(str) == idx) {
10856 return str;
10857 }
10858
10859 rb_enc_associate_index(str, idx);
10860
10861 // If the coderange was 7bit and the new encoding is ASCII-compatible
10862 // we can keep the coderange.
10863 if (ENC_CODERANGE(str) == ENC_CODERANGE_7BIT && encoding && rb_enc_asciicompat(encoding)) {
10864 return str;
10865 }
10866
10868 return str;
10869}
10870
10871/*
10872 * call-seq:
10873 * b -> string
10874 *
10875 * :include: doc/string/b.rdoc
10876 *
10877 */
10878
10879static VALUE
10880rb_str_b(VALUE str)
10881{
10882 VALUE str2;
10883 if (STR_EMBED_P(str)) {
10884 str2 = str_alloc_embed(rb_cString, RSTRING_LEN(str) + TERM_LEN(str));
10885 }
10886 else {
10887 str2 = str_alloc_heap(rb_cString);
10888 }
10889 str_replace_shared_without_enc(str2, str);
10890
10891 if (rb_enc_asciicompat(STR_ENC_GET(str))) {
10892 // BINARY strings can never be broken; they're either 7-bit ASCII or VALID.
10893 // If we know the receiver's code range then we know the result's code range.
10894 int cr = ENC_CODERANGE(str);
10895 switch (cr) {
10896 case ENC_CODERANGE_7BIT:
10898 break;
10902 break;
10903 default:
10904 ENC_CODERANGE_CLEAR(str2);
10905 break;
10906 }
10907 }
10908
10909 return str2;
10910}
10911
10912/*
10913 * call-seq:
10914 * valid_encoding? -> true or false
10915 *
10916 * Returns +true+ if +self+ is encoded correctly, +false+ otherwise:
10917 *
10918 * "\xc2\xa1".force_encoding("UTF-8").valid_encoding? # => true
10919 * "\xc2".force_encoding("UTF-8").valid_encoding? # => false
10920 * "\x80".force_encoding("UTF-8").valid_encoding? # => false
10921 */
10922
10923static VALUE
10924rb_str_valid_encoding_p(VALUE str)
10925{
10926 int cr = rb_enc_str_coderange(str);
10927
10928 return RBOOL(cr != ENC_CODERANGE_BROKEN);
10929}
10930
10931/*
10932 * call-seq:
10933 * ascii_only? -> true or false
10934 *
10935 * Returns +true+ if +self+ contains only ASCII characters,
10936 * +false+ otherwise:
10937 *
10938 * 'abc'.ascii_only? # => true
10939 * "abc\u{6666}".ascii_only? # => false
10940 *
10941 */
10942
10943static VALUE
10944rb_str_is_ascii_only_p(VALUE str)
10945{
10946 int cr = rb_enc_str_coderange(str);
10947
10948 return RBOOL(cr == ENC_CODERANGE_7BIT);
10949}
10950
10951VALUE
10953{
10954 static const char ellipsis[] = "...";
10955 const long ellipsislen = sizeof(ellipsis) - 1;
10956 rb_encoding *const enc = rb_enc_get(str);
10957 const long blen = RSTRING_LEN(str);
10958 const char *const p = RSTRING_PTR(str), *e = p + blen;
10959 VALUE estr, ret = 0;
10960
10961 if (len < 0) rb_raise(rb_eIndexError, "negative length %ld", len);
10962 if (len * rb_enc_mbminlen(enc) >= blen ||
10963 (e = rb_enc_nth(p, e, len, enc)) - p == blen) {
10964 ret = str;
10965 }
10966 else if (len <= ellipsislen ||
10967 !(e = rb_enc_step_back(p, e, e, len = ellipsislen, enc))) {
10968 if (rb_enc_asciicompat(enc)) {
10969 ret = rb_str_new(ellipsis, len);
10970 rb_enc_associate(ret, enc);
10971 }
10972 else {
10973 estr = rb_usascii_str_new(ellipsis, len);
10974 ret = rb_str_encode(estr, rb_enc_from_encoding(enc), 0, Qnil);
10975 }
10976 }
10977 else if (ret = rb_str_subseq(str, 0, e - p), rb_enc_asciicompat(enc)) {
10978 rb_str_cat(ret, ellipsis, ellipsislen);
10979 }
10980 else {
10981 estr = rb_str_encode(rb_usascii_str_new(ellipsis, ellipsislen),
10982 rb_enc_from_encoding(enc), 0, Qnil);
10983 rb_str_append(ret, estr);
10984 }
10985 return ret;
10986}
10987
10988static VALUE
10989str_compat_and_valid(VALUE str, rb_encoding *enc)
10990{
10991 int cr;
10992 str = StringValue(str);
10993 cr = rb_enc_str_coderange(str);
10994 if (cr == ENC_CODERANGE_BROKEN) {
10995 rb_raise(rb_eArgError, "replacement must be valid byte sequence '%+"PRIsVALUE"'", str);
10996 }
10997 else {
10998 rb_encoding *e = STR_ENC_GET(str);
10999 if (cr == ENC_CODERANGE_7BIT ? rb_enc_mbminlen(enc) != 1 : enc != e) {
11000 rb_raise(rb_eEncCompatError, "incompatible character encodings: %s and %s",
11001 rb_enc_name(enc), rb_enc_name(e));
11002 }
11003 }
11004 return str;
11005}
11006
11007static VALUE enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr);
11008
11009VALUE
11011{
11012 rb_encoding *enc = STR_ENC_GET(str);
11013 return enc_str_scrub(enc, str, repl, ENC_CODERANGE(str));
11014}
11015
11016VALUE
11017rb_enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl)
11018{
11019 int cr = ENC_CODERANGE_UNKNOWN;
11020 if (enc == STR_ENC_GET(str)) {
11021 /* cached coderange makes sense only when enc equals the
11022 * actual encoding of str */
11023 cr = ENC_CODERANGE(str);
11024 }
11025 return enc_str_scrub(enc, str, repl, cr);
11026}
11027
11028static VALUE
11029enc_str_scrub(rb_encoding *enc, VALUE str, VALUE repl, int cr)
11030{
11031 int encidx;
11032 VALUE buf = Qnil;
11033 const char *rep, *p, *e, *p1, *sp;
11034 long replen = -1;
11035 long slen;
11036
11037 if (rb_block_given_p()) {
11038 if (!NIL_P(repl))
11039 rb_raise(rb_eArgError, "both of block and replacement given");
11040 replen = 0;
11041 }
11042
11043 if (ENC_CODERANGE_CLEAN_P(cr))
11044 return Qnil;
11045
11046 if (!NIL_P(repl)) {
11047 repl = str_compat_and_valid(repl, enc);
11048 }
11049
11050 if (rb_enc_dummy_p(enc)) {
11051 return Qnil;
11052 }
11053 encidx = rb_enc_to_index(enc);
11054
11055#define DEFAULT_REPLACE_CHAR(str) do { \
11056 static const char replace[sizeof(str)-1] = str; \
11057 rep = replace; replen = (int)sizeof(replace); \
11058 } while (0)
11059
11060 slen = RSTRING_LEN(str);
11061 p = RSTRING_PTR(str);
11062 e = RSTRING_END(str);
11063 p1 = p;
11064 sp = p;
11065
11066 if (rb_enc_asciicompat(enc)) {
11067 int rep7bit_p;
11068 if (!replen) {
11069 rep = NULL;
11070 rep7bit_p = FALSE;
11071 }
11072 else if (!NIL_P(repl)) {
11073 rep = RSTRING_PTR(repl);
11074 replen = RSTRING_LEN(repl);
11075 rep7bit_p = (ENC_CODERANGE(repl) == ENC_CODERANGE_7BIT);
11076 }
11077 else if (encidx == rb_utf8_encindex()) {
11078 DEFAULT_REPLACE_CHAR("\xEF\xBF\xBD");
11079 rep7bit_p = FALSE;
11080 }
11081 else {
11082 DEFAULT_REPLACE_CHAR("?");
11083 rep7bit_p = TRUE;
11084 }
11085 cr = ENC_CODERANGE_7BIT;
11086
11087 p = search_nonascii(p, e);
11088 if (!p) {
11089 p = e;
11090 }
11091 while (p < e) {
11092 int ret = rb_enc_precise_mbclen(p, e, enc);
11093 if (MBCLEN_NEEDMORE_P(ret)) {
11094 break;
11095 }
11096 else if (MBCLEN_CHARFOUND_P(ret)) {
11098 p += MBCLEN_CHARFOUND_LEN(ret);
11099 }
11100 else if (MBCLEN_INVALID_P(ret)) {
11101 /*
11102 * p1~p: valid ascii/multibyte chars
11103 * p ~e: invalid bytes + unknown bytes
11104 */
11105 long clen = rb_enc_mbmaxlen(enc);
11106 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11107 if (p > p1) {
11108 rb_str_buf_cat(buf, p1, p - p1);
11109 }
11110
11111 if (e - p < clen) clen = e - p;
11112 if (clen <= 2) {
11113 clen = 1;
11114 }
11115 else {
11116 const char *q = p;
11117 clen--;
11118 for (; clen > 1; clen--) {
11119 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11120 if (MBCLEN_NEEDMORE_P(ret)) break;
11121 if (MBCLEN_INVALID_P(ret)) continue;
11123 }
11124 }
11125 if (rep) {
11126 rb_str_buf_cat(buf, rep, replen);
11127 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11128 }
11129 else {
11130 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11131 str_mod_check(str, sp, slen);
11132 repl = str_compat_and_valid(repl, enc);
11133 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11136 }
11137 p += clen;
11138 p1 = p;
11139 p = search_nonascii(p, e);
11140 if (!p) {
11141 p = e;
11142 break;
11143 }
11144 }
11145 else {
11147 }
11148 }
11149 if (NIL_P(buf)) {
11150 if (p == e) {
11151 ENC_CODERANGE_SET(str, cr);
11152 return Qnil;
11153 }
11154 buf = rb_str_buf_new(RSTRING_LEN(str));
11155 }
11156 if (p1 < p) {
11157 rb_str_buf_cat(buf, p1, p - p1);
11158 }
11159 if (p < e) {
11160 if (rep) {
11161 rb_str_buf_cat(buf, rep, replen);
11162 if (!rep7bit_p) cr = ENC_CODERANGE_VALID;
11163 }
11164 else {
11165 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11166 str_mod_check(str, sp, slen);
11167 repl = str_compat_and_valid(repl, enc);
11168 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11171 }
11172 }
11173 }
11174 else {
11175 /* ASCII incompatible */
11176 long mbminlen = rb_enc_mbminlen(enc);
11177 if (!replen) {
11178 rep = NULL;
11179 }
11180 else if (!NIL_P(repl)) {
11181 rep = RSTRING_PTR(repl);
11182 replen = RSTRING_LEN(repl);
11183 }
11184 else if (encidx == ENCINDEX_UTF_16BE) {
11185 DEFAULT_REPLACE_CHAR("\xFF\xFD");
11186 }
11187 else if (encidx == ENCINDEX_UTF_16LE) {
11188 DEFAULT_REPLACE_CHAR("\xFD\xFF");
11189 }
11190 else if (encidx == ENCINDEX_UTF_32BE) {
11191 DEFAULT_REPLACE_CHAR("\x00\x00\xFF\xFD");
11192 }
11193 else if (encidx == ENCINDEX_UTF_32LE) {
11194 DEFAULT_REPLACE_CHAR("\xFD\xFF\x00\x00");
11195 }
11196 else {
11197 DEFAULT_REPLACE_CHAR("?");
11198 }
11199
11200 while (p < e) {
11201 int ret = rb_enc_precise_mbclen(p, e, enc);
11202 if (MBCLEN_NEEDMORE_P(ret)) {
11203 break;
11204 }
11205 else if (MBCLEN_CHARFOUND_P(ret)) {
11206 p += MBCLEN_CHARFOUND_LEN(ret);
11207 }
11208 else if (MBCLEN_INVALID_P(ret)) {
11209 const char *q = p;
11210 long clen = rb_enc_mbmaxlen(enc);
11211 if (NIL_P(buf)) buf = rb_str_buf_new(RSTRING_LEN(str));
11212 if (p > p1) rb_str_buf_cat(buf, p1, p - p1);
11213
11214 if (e - p < clen) clen = e - p;
11215 if (clen <= mbminlen * 2) {
11216 clen = mbminlen;
11217 }
11218 else {
11219 clen -= mbminlen;
11220 for (; clen > mbminlen; clen-=mbminlen) {
11221 ret = rb_enc_precise_mbclen(q, q + clen, enc);
11222 if (MBCLEN_NEEDMORE_P(ret)) break;
11223 if (MBCLEN_INVALID_P(ret)) continue;
11225 }
11226 }
11227 if (rep) {
11228 rb_str_buf_cat(buf, rep, replen);
11229 }
11230 else {
11231 repl = rb_yield(rb_enc_str_new(p, clen, enc));
11232 str_mod_check(str, sp, slen);
11233 repl = str_compat_and_valid(repl, enc);
11234 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11235 }
11236 p += clen;
11237 p1 = p;
11238 }
11239 else {
11241 }
11242 }
11243 if (NIL_P(buf)) {
11244 if (p == e) {
11246 return Qnil;
11247 }
11248 buf = rb_str_buf_new(RSTRING_LEN(str));
11249 }
11250 if (p1 < p) {
11251 rb_str_buf_cat(buf, p1, p - p1);
11252 }
11253 if (p < e) {
11254 if (rep) {
11255 rb_str_buf_cat(buf, rep, replen);
11256 }
11257 else {
11258 repl = rb_yield(rb_enc_str_new(p, e-p, enc));
11259 str_mod_check(str, sp, slen);
11260 repl = str_compat_and_valid(repl, enc);
11261 rb_str_buf_cat(buf, RSTRING_PTR(repl), RSTRING_LEN(repl));
11262 }
11263 }
11265 }
11266 ENCODING_CODERANGE_SET(buf, rb_enc_to_index(enc), cr);
11267 return buf;
11268}
11269
11270/*
11271 * call-seq:
11272 * scrub(replacement_string = default_replacement) -> new_string
11273 * scrub{|bytes| ... } -> new_string
11274 *
11275 * :include: doc/string/scrub.rdoc
11276 *
11277 */
11278static VALUE
11279str_scrub(int argc, VALUE *argv, VALUE str)
11280{
11281 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11282 VALUE new = rb_str_scrub(str, repl);
11283 return NIL_P(new) ? str_duplicate(rb_cString, str): new;
11284}
11285
11286/*
11287 * call-seq:
11288 * scrub! -> self
11289 * scrub!(replacement_string = default_replacement) -> self
11290 * scrub!{|bytes| ... } -> self
11291 *
11292 * Like String#scrub, except that any replacements are made in +self+.
11293 *
11294 */
11295static VALUE
11296str_scrub_bang(int argc, VALUE *argv, VALUE str)
11297{
11298 VALUE repl = argc ? (rb_check_arity(argc, 0, 1), argv[0]) : Qnil;
11299 VALUE new = rb_str_scrub(str, repl);
11300 if (!NIL_P(new)) rb_str_replace(str, new);
11301 return str;
11302}
11303
/* Method IDs for "normalize" and "normalized?"; both are interned in Init_String. */
11304static ID id_normalize;
11305static ID id_normalized_p;
/* The UnicodeNormalize module, defined in Init_String; it is the receiver of
 * the calls made by unicode_normalize_common(). */
11306static VALUE mUnicodeNormalize;
11307
11308static VALUE
11309unicode_normalize_common(int argc, VALUE *argv, VALUE str, ID id)
11310{
11311 static int UnicodeNormalizeRequired = 0;
11312 VALUE argv2[2];
11313
11314 if (!UnicodeNormalizeRequired) {
11315 rb_require("unicode_normalize/normalize.rb");
11316 UnicodeNormalizeRequired = 1;
11317 }
11318 argv2[0] = str;
11319 if (rb_check_arity(argc, 0, 1)) argv2[1] = argv[0];
11320 return rb_funcallv(mUnicodeNormalize, id, argc+1, argv2);
11321}
11322
11323/*
11324 * call-seq:
11325 * unicode_normalize(form = :nfc) -> string
11326 *
11327 * Returns a copy of +self+ with
11328 * {Unicode normalization}[https://unicode.org/reports/tr15] applied.
11329 *
11330 * Argument +form+ must be one of the following symbols
11331 * (see {Unicode normalization forms}[https://unicode.org/reports/tr15/#Norm_Forms]):
11332 *
11333 * - +:nfc+: Canonical decomposition, followed by canonical composition.
11334 * - +:nfd+: Canonical decomposition.
11335 * - +:nfkc+: Compatibility decomposition, followed by canonical composition.
11336 * - +:nfkd+: Compatibility decomposition.
11337 *
11338 * The encoding of +self+ must be one of:
11339 *
11340 * - Encoding::UTF_8
11341 * - Encoding::UTF_16BE
11342 * - Encoding::UTF_16LE
11343 * - Encoding::UTF_32BE
11344 * - Encoding::UTF_32LE
11345 * - Encoding::GB18030
11346 * - Encoding::UCS_2BE
11347 * - Encoding::UCS_4BE
11348 *
11349 * Examples:
11350 *
11351 * "a\u0300".unicode_normalize # => "\u00E0"
11352 * "\u00E0".unicode_normalize(:nfd) # => "a\u0300"
11353 *
11354 * Related: String#unicode_normalize!, String#unicode_normalized?.
11355 */
11356static VALUE
11357rb_str_unicode_normalize(int argc, VALUE *argv, VALUE str)
11358{
11359 return unicode_normalize_common(argc, argv, str, id_normalize);
11360}
11361
11362/*
11363 * call-seq:
11364 * unicode_normalize!(form = :nfc) -> self
11365 *
11366 * Like String#unicode_normalize, except that the normalization
11367 * is performed on +self+.
11368 *
11369 * Related: String#unicode_normalized?.
11370 *
11371 */
11372static VALUE
11373rb_str_unicode_normalize_bang(int argc, VALUE *argv, VALUE str)
11374{
11375 return rb_str_replace(str, unicode_normalize_common(argc, argv, str, id_normalize));
11376}
11377
11378/* call-seq:
11379 * unicode_normalized?(form = :nfc) -> true or false
11380 *
11381 * Returns +true+ if +self+ is in the given +form+ of Unicode normalization,
11382 * +false+ otherwise.
11383 * The +form+ must be one of +:nfc+, +:nfd+, +:nfkc+, or +:nfkd+.
11384 *
11385 * Examples:
11386 *
11387 * "a\u0300".unicode_normalized? # => false
11388 * "a\u0300".unicode_normalized?(:nfd) # => true
11389 * "\u00E0".unicode_normalized? # => true
11390 * "\u00E0".unicode_normalized?(:nfd) # => false
11391 *
11392 *
11393 * Raises an exception if +self+ is not in a Unicode encoding:
11394 *
11395 * s = "\xE0".force_encoding('ISO-8859-1')
11396 * s.unicode_normalized? # Raises Encoding::CompatibilityError.
11397 *
11398 * Related: String#unicode_normalize, String#unicode_normalize!.
11399 *
11400 */
11401static VALUE
11402rb_str_unicode_normalized_p(int argc, VALUE *argv, VALUE str)
11403{
11404 return unicode_normalize_common(argc, argv, str, id_normalized_p);
11405}
11406
11407/**********************************************************************
11408 * Document-class: Symbol
11409 *
11410 * \Symbol objects represent named identifiers inside the Ruby interpreter.
11411 *
11412 * You can create a \Symbol object explicitly with:
11413 *
11414 * - A {symbol literal}[rdoc-ref:syntax/literals.rdoc@Symbol+Literals].
11415 *
11416 * The same \Symbol object will be
11417 * created for a given name or string for the duration of a program's
11418 * execution, regardless of the context or meaning of that name. Thus
11419 * if <code>Fred</code> is a constant in one context, a method in
11420 * another, and a class in a third, the \Symbol <code>:Fred</code>
11421 * will be the same object in all three contexts.
11422 *
11423 * module One
11424 * class Fred
11425 * end
11426 * $f1 = :Fred
11427 * end
11428 * module Two
11429 * Fred = 1
11430 * $f2 = :Fred
11431 * end
11432 * def Fred()
11433 * end
11434 * $f3 = :Fred
11435 * $f1.object_id #=> 2514190
11436 * $f2.object_id #=> 2514190
11437 * $f3.object_id #=> 2514190
11438 *
11439 * Constant, method, and variable names are returned as symbols:
11440 *
11441 * module One
11442 * Two = 2
11443 * def three; 3 end
11444 * @four = 4
11445 * @@five = 5
11446 * $six = 6
11447 * end
11448 * seven = 7
11449 *
11450 * One.constants
11451 * # => [:Two]
11452 * One.instance_methods(true)
11453 * # => [:three]
11454 * One.instance_variables
11455 * # => [:@four]
11456 * One.class_variables
11457 * # => [:@@five]
11458 * global_variables.grep(/six/)
11459 * # => [:$six]
11460 * local_variables
11461 * # => [:seven]
11462 *
11463 * \Symbol objects are different from String objects in that
11464 * \Symbol objects represent identifiers, while String objects
11465 * represent text or data.
11466 *
11467 * == What's Here
11468 *
11469 * First, what's elsewhere. \Class \Symbol:
11470 *
11471 * - Inherits from {class Object}[rdoc-ref:Object@What-27s+Here].
11472 * - Includes {module Comparable}[rdoc-ref:Comparable@What-27s+Here].
11473 *
11474 * Here, class \Symbol provides methods that are useful for:
11475 *
11476 * - {Querying}[rdoc-ref:Symbol@Methods+for+Querying]
11477 * - {Comparing}[rdoc-ref:Symbol@Methods+for+Comparing]
11478 * - {Converting}[rdoc-ref:Symbol@Methods+for+Converting]
11479 *
11480 * === Methods for Querying
11481 *
11482 * - ::all_symbols: Returns an array of the symbols currently in Ruby's symbol table.
11483 * - #=~: Returns the index of the first substring in symbol that matches a
11484 * given Regexp or other object; returns +nil+ if no match is found.
11485 * - #[], #slice : Returns a substring of symbol
11486 * determined by a given index, start/length, or range, or string.
11487 * - #empty?: Returns +true+ if +self.length+ is zero; +false+ otherwise.
11488 * - #encoding: Returns the Encoding object that represents the encoding
11489 * of symbol.
11490 * - #end_with?: Returns +true+ if symbol ends with
11491 * any of the given strings.
11492 * - #match: Returns a MatchData object if symbol
11493 * matches a given Regexp; +nil+ otherwise.
11494 * - #match?: Returns +true+ if symbol
11495 * matches a given Regexp; +false+ otherwise.
11496 * - #length, #size: Returns the number of characters in symbol.
11497 * - #start_with?: Returns +true+ if symbol starts with
11498 * any of the given strings.
11499 *
11500 * === Methods for Comparing
11501 *
11502 * - #<=>: Returns -1, 0, or 1 as a given symbol is smaller than, equal to,
11503 * or larger than symbol.
11504 * - #==, #===: Returns +true+ if a given symbol has the same content and
11505 * encoding.
11506 * - #casecmp: Ignoring case, returns -1, 0, or 1 as a given
11507 * symbol is smaller than, equal to, or larger than symbol.
11508 * - #casecmp?: Returns +true+ if symbol is equal to a given symbol
11509 * after Unicode case folding; +false+ otherwise.
11510 *
11511 * === Methods for Converting
11512 *
11513 * - #capitalize: Returns symbol with the first character upcased
11514 * and all other characters downcased.
11515 * - #downcase: Returns symbol with all characters downcased.
11516 * - #inspect: Returns the string representation of +self+ as a symbol literal.
11517 * - #name: Returns the frozen string corresponding to symbol.
11518 * - #succ, #next: Returns the symbol that is the successor to symbol.
11519 * - #swapcase: Returns symbol with all upcase characters downcased
11520 * and all downcase characters upcased.
11521 * - #to_proc: Returns a Proc object which responds to the method named by symbol.
11522 * - #to_s, #id2name: Returns the string corresponding to +self+.
11523 * - #to_sym, #intern: Returns +self+.
11524 * - #upcase: Returns symbol with all characters upcased.
11525 *
11526 */
11527
11528
11529/*
11530 * call-seq:
11531 * symbol == object -> true or false
11532 *
11533 * Returns +true+ if +object+ is the same object as +self+, +false+ otherwise.
11534 */
11535
11536#define sym_equal rb_obj_equal /* registered as Symbol#== and Symbol#=== in Init_String */
11537
11538static int
11539sym_printable(const char *s, const char *send, rb_encoding *enc)
11540{
11541 while (s < send) {
11542 int n;
11543 int c = rb_enc_precise_mbclen(s, send, enc);
11544
11545 if (!MBCLEN_CHARFOUND_P(c)) return FALSE;
11546 n = MBCLEN_CHARFOUND_LEN(c);
11547 c = rb_enc_mbc_to_codepoint(s, send, enc);
11548 if (!rb_enc_isprint(c, enc)) return FALSE;
11549 s += n;
11550 }
11551 return TRUE;
11552}
11553
11554int
11555rb_str_symname_p(VALUE sym)
11556{
11557 rb_encoding *enc;
11558 const char *ptr;
11559 long len;
11560 rb_encoding *resenc = rb_default_internal_encoding();
11561
11562 if (resenc == NULL) resenc = rb_default_external_encoding();
11563 enc = STR_ENC_GET(sym);
11564 ptr = RSTRING_PTR(sym);
11565 len = RSTRING_LEN(sym);
11566 if ((resenc != enc && !rb_str_is_ascii_only_p(sym)) || len != (long)strlen(ptr) ||
11567 !rb_enc_symname2_p(ptr, len, enc) || !sym_printable(ptr, ptr + len, enc)) {
11568 return FALSE;
11569 }
11570 return TRUE;
11571}
11572
11573VALUE
11574rb_str_quote_unprintable(VALUE str)
11575{
11576 rb_encoding *enc;
11577 const char *ptr;
11578 long len;
11579 rb_encoding *resenc;
11580
11581 Check_Type(str, T_STRING);
11582 resenc = rb_default_internal_encoding();
11583 if (resenc == NULL) resenc = rb_default_external_encoding();
11584 enc = STR_ENC_GET(str);
11585 ptr = RSTRING_PTR(str);
11586 len = RSTRING_LEN(str);
11587 if ((resenc != enc && !rb_str_is_ascii_only_p(str)) ||
11588 !sym_printable(ptr, ptr + len, enc)) {
11589 return rb_str_escape(str);
11590 }
11591 return str;
11592}
11593
11594VALUE
11595rb_id_quote_unprintable(ID id)
11596{
11597 VALUE str = rb_id2str(id);
11598 if (!rb_str_symname_p(str)) {
11599 return rb_str_escape(str);
11600 }
11601 return str;
11602}
11603
11604/*
11605 * call-seq:
11606 * inspect -> string
11607 *
11608 * Returns a string representation of +self+ (including the leading colon):
11609 *
11610 * :foo.inspect # => ":foo"
11611 *
11612 * Related: Symbol#to_s, Symbol#name.
11613 *
11614 */
11615
11616static VALUE
11617sym_inspect(VALUE sym)
11618{
11619 VALUE str = rb_sym2str(sym);
11620 const char *ptr;
11621 long len;
11622 char *dest;
11623
11624 if (!rb_str_symname_p(str)) {
11625 str = rb_str_inspect(str);
11626 len = RSTRING_LEN(str);
11627 rb_str_resize(str, len + 1);
11628 dest = RSTRING_PTR(str);
11629 memmove(dest + 1, dest, len);
11630 }
11631 else {
11632 rb_encoding *enc = STR_ENC_GET(str);
11633
11634 VALUE orig_str = str;
11635 RSTRING_GETMEM(orig_str, ptr, len);
11636
11637 str = rb_enc_str_new(0, len + 1, enc);
11638 dest = RSTRING_PTR(str);
11639 memcpy(dest + 1, ptr, len);
11640
11641 RB_GC_GUARD(orig_str);
11642 }
11643 dest[0] = ':';
11644 return str;
11645}
11646
11647/*
11648 * call-seq:
11649 * to_s -> string
11650 *
11651 * Returns a string representation of +self+ (not including the leading colon):
11652 *
11653 * :foo.to_s # => "foo"
11654 *
11655 * Related: Symbol#inspect, Symbol#name.
11656 */
11657
11658VALUE
11660{
11661 return str_new_shared(rb_cString, rb_sym2str(sym));
11662}
11663
11664VALUE
11665rb_sym_proc_call(ID mid, int argc, const VALUE *argv, int kw_splat, VALUE passed_proc)
11666{
11667 VALUE obj;
11668
11669 if (argc < 1) {
11670 rb_raise(rb_eArgError, "no receiver given");
11671 }
11672 obj = argv[0];
11673 return rb_funcall_with_block_kw(obj, mid, argc - 1, argv + 1, passed_proc, kw_splat);
11674}
11675
11676/*
11677 * call-seq:
11678 * succ
11679 *
11680 * Equivalent to <tt>self.to_s.succ.to_sym</tt>:
11681 *
11682 * :foo.succ # => :fop
11683 *
11684 * Related: String#succ.
11685 */
11686
11687static VALUE
11688sym_succ(VALUE sym)
11689{
11690 return rb_str_intern(rb_str_succ(rb_sym2str(sym)));
11691}
11692
11693/*
11694 * call-seq:
11695 * symbol <=> object -> -1, 0, +1, or nil
11696 *
11697 * If +object+ is a symbol,
11698 * returns the equivalent of <tt>symbol.to_s <=> object.to_s</tt>:
11699 *
11700 * :bar <=> :foo # => -1
11701 * :foo <=> :foo # => 0
11702 * :foo <=> :bar # => 1
11703 *
11704 * Otherwise, returns +nil+:
11705 *
11706 * :foo <=> 'bar' # => nil
11707 *
11708 * Related: String#<=>.
11709 */
11710
11711static VALUE
11712sym_cmp(VALUE sym, VALUE other)
11713{
11714 if (!SYMBOL_P(other)) {
11715 return Qnil;
11716 }
11717 return rb_str_cmp_m(rb_sym2str(sym), rb_sym2str(other));
11718}
11719
11720/*
11721 * call-seq:
11722 * casecmp(object) -> -1, 0, 1, or nil
11723 *
11724 * :include: doc/symbol/casecmp.rdoc
11725 *
11726 */
11727
11728static VALUE
11729sym_casecmp(VALUE sym, VALUE other)
11730{
11731 if (!SYMBOL_P(other)) {
11732 return Qnil;
11733 }
11734 return str_casecmp(rb_sym2str(sym), rb_sym2str(other));
11735}
11736
11737/*
11738 * call-seq:
11739 * casecmp?(object) -> true, false, or nil
11740 *
11741 * :include: doc/symbol/casecmp_p.rdoc
11742 *
11743 */
11744
11745static VALUE
11746sym_casecmp_p(VALUE sym, VALUE other)
11747{
11748 if (!SYMBOL_P(other)) {
11749 return Qnil;
11750 }
11751 return str_casecmp_p(rb_sym2str(sym), rb_sym2str(other));
11752}
11753
11754/*
11755 * call-seq:
11756 * symbol =~ object -> integer or nil
11757 *
11758 * Equivalent to <tt>symbol.to_s =~ object</tt>,
11759 * including possible updates to global variables;
11760 * see String#=~.
11761 *
11762 */
11763
11764static VALUE
11765sym_match(VALUE sym, VALUE other)
11766{
11767 return rb_str_match(rb_sym2str(sym), other);
11768}
11769
11770/*
11771 * call-seq:
11772 * match(pattern, offset = 0) -> matchdata or nil
11773 * match(pattern, offset = 0) {|matchdata| } -> object
11774 *
11775 * Equivalent to <tt>self.to_s.match</tt>,
11776 * including possible updates to global variables;
11777 * see String#match.
11778 *
11779 */
11780
11781static VALUE
11782sym_match_m(int argc, VALUE *argv, VALUE sym)
11783{
11784 return rb_str_match_m(argc, argv, rb_sym2str(sym));
11785}
11786
11787/*
11788 * call-seq:
11789 * match?(pattern, offset = 0) -> true or false
11790 *
11791 * Equivalent to <tt>sym.to_s.match?</tt>;
11792 * see String#match?.
11793 *
11794 */
11795
11796static VALUE
11797sym_match_m_p(int argc, VALUE *argv, VALUE sym)
11798{
11799 return rb_str_match_m_p(argc, argv, sym);
11800}
11801
11802/*
11803 * call-seq:
11804 * symbol[index] -> string or nil
11805 * symbol[start, length] -> string or nil
11806 * symbol[range] -> string or nil
11807 * symbol[regexp, capture = 0] -> string or nil
11808 * symbol[substring] -> string or nil
11809 *
11810 * Equivalent to <tt>symbol.to_s[]</tt>; see String#[].
11811 *
11812 */
11813
11814static VALUE
11815sym_aref(int argc, VALUE *argv, VALUE sym)
11816{
11817 return rb_str_aref_m(argc, argv, rb_sym2str(sym));
11818}
11819
11820/*
11821 * call-seq:
11822 * length -> integer
11823 *
11824 * Equivalent to <tt>self.to_s.length</tt>; see String#length.
11825 */
11826
11827static VALUE
11828sym_length(VALUE sym)
11829{
11830 return rb_str_length(rb_sym2str(sym));
11831}
11832
11833/*
11834 * call-seq:
11835 * empty? -> true or false
11836 *
11837 * Returns +true+ if +self+ is <tt>:''</tt>, +false+ otherwise.
11838 *
11839 */
11840
11841static VALUE
11842sym_empty(VALUE sym)
11843{
11844 return rb_str_empty(rb_sym2str(sym));
11845}
11846
11847/*
11848 * call-seq:
11849 * upcase(*options) -> symbol
11850 *
11851 * Equivalent to <tt>sym.to_s.upcase.to_sym</tt>.
11852 *
11853 * See String#upcase.
11854 *
11855 */
11856
11857static VALUE
11858sym_upcase(int argc, VALUE *argv, VALUE sym)
11859{
11860 return rb_str_intern(rb_str_upcase(argc, argv, rb_sym2str(sym)));
11861}
11862
11863/*
11864 * call-seq:
11865 * downcase(*options) -> symbol
11866 *
11867 * Equivalent to <tt>sym.to_s.downcase.to_sym</tt>.
11868 *
11869 * See String#downcase.
11870 *
11871 * Related: Symbol#upcase.
11872 *
11873 */
11874
11875static VALUE
11876sym_downcase(int argc, VALUE *argv, VALUE sym)
11877{
11878 return rb_str_intern(rb_str_downcase(argc, argv, rb_sym2str(sym)));
11879}
11880
11881/*
11882 * call-seq:
11883 * capitalize(*options) -> symbol
11884 *
11885 * Equivalent to <tt>sym.to_s.capitalize.to_sym</tt>.
11886 *
11887 * See String#capitalize.
11888 *
11889 */
11890
11891static VALUE
11892sym_capitalize(int argc, VALUE *argv, VALUE sym)
11893{
11894 return rb_str_intern(rb_str_capitalize(argc, argv, rb_sym2str(sym)));
11895}
11896
11897/*
11898 * call-seq:
11899 * swapcase(*options) -> symbol
11900 *
11901 * Equivalent to <tt>sym.to_s.swapcase.to_sym</tt>.
11902 *
11903 * See String#swapcase.
11904 *
11905 */
11906
11907static VALUE
11908sym_swapcase(int argc, VALUE *argv, VALUE sym)
11909{
11910 return rb_str_intern(rb_str_swapcase(argc, argv, rb_sym2str(sym)));
11911}
11912
11913/*
11914 * call-seq:
11915 * start_with?(*string_or_regexp) -> true or false
11916 *
11917 * Equivalent to <tt>self.to_s.start_with?</tt>; see String#start_with?.
11918 *
11919 */
11920
11921static VALUE
11922sym_start_with(int argc, VALUE *argv, VALUE sym)
11923{
11924 return rb_str_start_with(argc, argv, rb_sym2str(sym));
11925}
11926
11927/*
11928 * call-seq:
11929 * end_with?(*strings) -> true or false
11930 *
11931 *
11932 * Equivalent to <tt>self.to_s.end_with?</tt>; see String#end_with?.
11933 *
11934 */
11935
11936static VALUE
11937sym_end_with(int argc, VALUE *argv, VALUE sym)
11938{
11939 return rb_str_end_with(argc, argv, rb_sym2str(sym));
11940}
11941
11942/*
11943 * call-seq:
11944 * encoding -> encoding
11945 *
11946 * Equivalent to <tt>self.to_s.encoding</tt>; see String#encoding.
11947 *
11948 */
11949
11950static VALUE
11951sym_encoding(VALUE sym)
11952{
11953 return rb_obj_encoding(rb_sym2str(sym));
11954}
11955
11956static VALUE
11957string_for_symbol(VALUE name)
11958{
11959 if (!RB_TYPE_P(name, T_STRING)) {
11960 VALUE tmp = rb_check_string_type(name);
11961 if (NIL_P(tmp)) {
11962 rb_raise(rb_eTypeError, "%+"PRIsVALUE" is not a symbol",
11963 name);
11964 }
11965 name = tmp;
11966 }
11967 return name;
11968}
11969
11970ID
11972{
11973 if (SYMBOL_P(name)) {
11974 return SYM2ID(name);
11975 }
11976 name = string_for_symbol(name);
11977 return rb_intern_str(name);
11978}
11979
11980VALUE
11982{
11983 if (SYMBOL_P(name)) {
11984 return name;
11985 }
11986 name = string_for_symbol(name);
11987 return rb_str_intern(name);
11988}
11989
11990/*
11991 * call-seq:
11992 * Symbol.all_symbols -> array_of_symbols
11993 *
11994 * Returns an array of all symbols currently in Ruby's symbol table:
11995 *
11996 * Symbol.all_symbols.size # => 9334
11997 * Symbol.all_symbols.take(3) # => [:!, :"\"", :"#"]
11998 *
11999 */
12000
12001static VALUE
12002sym_all_symbols(VALUE _)
12003{
12004 return rb_sym_all_symbols();
12005}
12006
12007VALUE
12009{
12010 return rb_fstring(str);
12011}
12012
12013VALUE
12014rb_interned_str(const char *ptr, long len)
12015{
12016 struct RString fake_str;
12017 return register_fstring(setup_fake_str(&fake_str, ptr, len, ENCINDEX_US_ASCII), TRUE);
12018}
12019
12020VALUE
12022{
12023 return rb_interned_str(ptr, strlen(ptr));
12024}
12025
12026VALUE
12027rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
12028{
12029 if (UNLIKELY(rb_enc_autoload_p(enc))) {
12030 rb_enc_autoload(enc);
12031 }
12032
12033 struct RString fake_str;
12034 return register_fstring(rb_setup_fake_str(&fake_str, ptr, len, enc), TRUE);
12035}
12036
12037VALUE
12039{
12040 return rb_enc_interned_str(ptr, strlen(ptr), enc);
12041}
12042
12043void
12044Init_String(void)
12045{
/*
 * Defines the String and Symbol classes and registers their methods,
 * together with the UnicodeNormalize module handle and the $; / $-F
 * field-separator variables.
 *
 * NOTE(review): the embedded listing numbers skip in several places inside
 * this function (e.g. 12048 -> 12050, 12055 -> 12058, 12215 -> 12219,
 * 12223 -> 12225); registration lines may have been dropped from this copy
 * -- confirm against the upstream file before relying on it.
 */
12046 rb_cString = rb_define_class("String", rb_cObject);
/* Visit every entry of the VM fstring table with fstring_set_class_i, passing the new String class. */
12047 assert(rb_vm_fstring_table());
12048 st_foreach(rb_vm_fstring_table(), fstring_set_class_i, rb_cString);
12050 rb_define_alloc_func(rb_cString, empty_str_alloc);
12051 rb_define_singleton_method(rb_cString, "new", rb_str_s_new, -1);
12052 rb_define_singleton_method(rb_cString, "try_convert", rb_str_s_try_convert, 1);
12053 rb_define_method(rb_cString, "initialize", rb_str_init, -1);
12054 rb_define_method(rb_cString, "initialize_copy", rb_str_replace, 1);
12055 rb_define_method(rb_cString, "<=>", rb_str_cmp_m, 1);
12058 rb_define_method(rb_cString, "eql?", rb_str_eql, 1);
12059 rb_define_method(rb_cString, "hash", rb_str_hash_m, 0);
12060 rb_define_method(rb_cString, "casecmp", rb_str_casecmp, 1);
12061 rb_define_method(rb_cString, "casecmp?", rb_str_casecmp_p, 1);
12064 rb_define_method(rb_cString, "%", rb_str_format_m, 1);
12065 rb_define_method(rb_cString, "[]", rb_str_aref_m, -1);
12066 rb_define_method(rb_cString, "[]=", rb_str_aset_m, -1);
12067 rb_define_method(rb_cString, "insert", rb_str_insert, 2);
12070 rb_define_method(rb_cString, "bytesize", rb_str_bytesize, 0);
12071 rb_define_method(rb_cString, "empty?", rb_str_empty, 0);
12072 rb_define_method(rb_cString, "=~", rb_str_match, 1);
12073 rb_define_method(rb_cString, "match", rb_str_match_m, -1);
12074 rb_define_method(rb_cString, "match?", rb_str_match_m_p, -1);
12076 rb_define_method(rb_cString, "succ!", rb_str_succ_bang, 0);
12078 rb_define_method(rb_cString, "next!", rb_str_succ_bang, 0);
12079 rb_define_method(rb_cString, "upto", rb_str_upto, -1);
12080 rb_define_method(rb_cString, "index", rb_str_index_m, -1);
12081 rb_define_method(rb_cString, "byteindex", rb_str_byteindex_m, -1);
12082 rb_define_method(rb_cString, "rindex", rb_str_rindex_m, -1);
12083 rb_define_method(rb_cString, "byterindex", rb_str_byterindex_m, -1);
12085 rb_define_method(rb_cString, "clear", rb_str_clear, 0);
12086 rb_define_method(rb_cString, "chr", rb_str_chr, 0);
12087 rb_define_method(rb_cString, "getbyte", rb_str_getbyte, 1);
12088 rb_define_method(rb_cString, "setbyte", rb_str_setbyte, 2);
12089 rb_define_method(rb_cString, "byteslice", rb_str_byteslice, -1);
12090 rb_define_method(rb_cString, "bytesplice", rb_str_bytesplice, -1);
12091 rb_define_method(rb_cString, "scrub", str_scrub, -1);
12092 rb_define_method(rb_cString, "scrub!", str_scrub_bang, -1);
12094 rb_define_method(rb_cString, "+@", str_uplus, 0);
12095 rb_define_method(rb_cString, "-@", str_uminus, 0);
12096 rb_define_method(rb_cString, "dup", rb_str_dup_m, 0);
12097 rb_define_alias(rb_cString, "dedup", "-@");
12098
12099 rb_define_method(rb_cString, "to_i", rb_str_to_i, -1);
12100 rb_define_method(rb_cString, "to_f", rb_str_to_f, 0);
12101 rb_define_method(rb_cString, "to_s", rb_str_to_s, 0);
12102 rb_define_method(rb_cString, "to_str", rb_str_to_s, 0);
12105 rb_define_method(rb_cString, "undump", str_undump, 0);
12106
/* NOTE(review): presumably the option symbols accepted by the case-mapping methods registered just below -- confirm. */
12107 sym_ascii = ID2SYM(rb_intern_const("ascii"));
12108 sym_turkic = ID2SYM(rb_intern_const("turkic"));
12109 sym_lithuanian = ID2SYM(rb_intern_const("lithuanian"));
12110 sym_fold = ID2SYM(rb_intern_const("fold"));
12111
12112 rb_define_method(rb_cString, "upcase", rb_str_upcase, -1);
12113 rb_define_method(rb_cString, "downcase", rb_str_downcase, -1);
12114 rb_define_method(rb_cString, "capitalize", rb_str_capitalize, -1);
12115 rb_define_method(rb_cString, "swapcase", rb_str_swapcase, -1);
12116
12117 rb_define_method(rb_cString, "upcase!", rb_str_upcase_bang, -1);
12118 rb_define_method(rb_cString, "downcase!", rb_str_downcase_bang, -1);
12119 rb_define_method(rb_cString, "capitalize!", rb_str_capitalize_bang, -1);
12120 rb_define_method(rb_cString, "swapcase!", rb_str_swapcase_bang, -1);
12121
12122 rb_define_method(rb_cString, "hex", rb_str_hex, 0);
12123 rb_define_method(rb_cString, "oct", rb_str_oct, 0);
12124 rb_define_method(rb_cString, "split", rb_str_split_m, -1);
12125 rb_define_method(rb_cString, "lines", rb_str_lines, -1);
12126 rb_define_method(rb_cString, "bytes", rb_str_bytes, 0);
12127 rb_define_method(rb_cString, "chars", rb_str_chars, 0);
12128 rb_define_method(rb_cString, "codepoints", rb_str_codepoints, 0);
12129 rb_define_method(rb_cString, "grapheme_clusters", rb_str_grapheme_clusters, 0);
12130 rb_define_method(rb_cString, "reverse", rb_str_reverse, 0);
12131 rb_define_method(rb_cString, "reverse!", rb_str_reverse_bang, 0);
12132 rb_define_method(rb_cString, "concat", rb_str_concat_multi, -1);
12134 rb_define_method(rb_cString, "prepend", rb_str_prepend_multi, -1);
12135 rb_define_method(rb_cString, "crypt", rb_str_crypt, 1);
12136 rb_define_method(rb_cString, "intern", rb_str_intern, 0); /* in symbol.c */
12137 rb_define_method(rb_cString, "to_sym", rb_str_intern, 0); /* in symbol.c */
12138 rb_define_method(rb_cString, "ord", rb_str_ord, 0);
12139
12140 rb_define_method(rb_cString, "include?", rb_str_include, 1);
12141 rb_define_method(rb_cString, "start_with?", rb_str_start_with, -1);
12142 rb_define_method(rb_cString, "end_with?", rb_str_end_with, -1);
12143
12144 rb_define_method(rb_cString, "scan", rb_str_scan, 1);
12145
12146 rb_define_method(rb_cString, "ljust", rb_str_ljust, -1);
12147 rb_define_method(rb_cString, "rjust", rb_str_rjust, -1);
12148 rb_define_method(rb_cString, "center", rb_str_center, -1);
12149
12150 rb_define_method(rb_cString, "sub", rb_str_sub, -1);
12151 rb_define_method(rb_cString, "gsub", rb_str_gsub, -1);
12152 rb_define_method(rb_cString, "chop", rb_str_chop, 0);
12153 rb_define_method(rb_cString, "chomp", rb_str_chomp, -1);
12154 rb_define_method(rb_cString, "strip", rb_str_strip, 0);
12155 rb_define_method(rb_cString, "lstrip", rb_str_lstrip, 0);
12156 rb_define_method(rb_cString, "rstrip", rb_str_rstrip, 0);
12157 rb_define_method(rb_cString, "delete_prefix", rb_str_delete_prefix, 1);
12158 rb_define_method(rb_cString, "delete_suffix", rb_str_delete_suffix, 1);
12159
12160 rb_define_method(rb_cString, "sub!", rb_str_sub_bang, -1);
12161 rb_define_method(rb_cString, "gsub!", rb_str_gsub_bang, -1);
12162 rb_define_method(rb_cString, "chop!", rb_str_chop_bang, 0);
12163 rb_define_method(rb_cString, "chomp!", rb_str_chomp_bang, -1);
12164 rb_define_method(rb_cString, "strip!", rb_str_strip_bang, 0);
12165 rb_define_method(rb_cString, "lstrip!", rb_str_lstrip_bang, 0);
12166 rb_define_method(rb_cString, "rstrip!", rb_str_rstrip_bang, 0);
12167 rb_define_method(rb_cString, "delete_prefix!", rb_str_delete_prefix_bang, 1);
12168 rb_define_method(rb_cString, "delete_suffix!", rb_str_delete_suffix_bang, 1);
12169
12170 rb_define_method(rb_cString, "tr", rb_str_tr, 2);
12171 rb_define_method(rb_cString, "tr_s", rb_str_tr_s, 2);
12172 rb_define_method(rb_cString, "delete", rb_str_delete, -1);
12173 rb_define_method(rb_cString, "squeeze", rb_str_squeeze, -1);
12174 rb_define_method(rb_cString, "count", rb_str_count, -1);
12175
12176 rb_define_method(rb_cString, "tr!", rb_str_tr_bang, 2);
12177 rb_define_method(rb_cString, "tr_s!", rb_str_tr_s_bang, 2);
12178 rb_define_method(rb_cString, "delete!", rb_str_delete_bang, -1);
12179 rb_define_method(rb_cString, "squeeze!", rb_str_squeeze_bang, -1);
12180
12181 rb_define_method(rb_cString, "each_line", rb_str_each_line, -1);
12182 rb_define_method(rb_cString, "each_byte", rb_str_each_byte, 0);
12183 rb_define_method(rb_cString, "each_char", rb_str_each_char, 0);
12184 rb_define_method(rb_cString, "each_codepoint", rb_str_each_codepoint, 0);
12185 rb_define_method(rb_cString, "each_grapheme_cluster", rb_str_each_grapheme_cluster, 0);
12186
12187 rb_define_method(rb_cString, "sum", rb_str_sum, -1);
12188
12189 rb_define_method(rb_cString, "slice", rb_str_aref_m, -1);
12190 rb_define_method(rb_cString, "slice!", rb_str_slice_bang, -1);
12191
12192 rb_define_method(rb_cString, "partition", rb_str_partition, 1);
12193 rb_define_method(rb_cString, "rpartition", rb_str_rpartition, 1);
12194
12195 rb_define_method(rb_cString, "encoding", rb_obj_encoding, 0); /* in encoding.c */
12196 rb_define_method(rb_cString, "force_encoding", rb_str_force_encoding, 1);
12197 rb_define_method(rb_cString, "b", rb_str_b, 0);
12198 rb_define_method(rb_cString, "valid_encoding?", rb_str_valid_encoding_p, 0);
12199 rb_define_method(rb_cString, "ascii_only?", rb_str_is_ascii_only_p, 0);
12200
12201 /* define UnicodeNormalize module here so that we don't have to look it up */
12202 mUnicodeNormalize = rb_define_module("UnicodeNormalize");
12203 id_normalize = rb_intern_const("normalize");
12204 id_normalized_p = rb_intern_const("normalized?");
12205
12206 rb_define_method(rb_cString, "unicode_normalize", rb_str_unicode_normalize, -1);
12207 rb_define_method(rb_cString, "unicode_normalize!", rb_str_unicode_normalize_bang, -1);
12208 rb_define_method(rb_cString, "unicode_normalized?", rb_str_unicode_normalized_p, -1);
12209
/* $; and $-F are hooked to the same rb_fs variable through rb_fs_setter; rb_fs starts as nil and its address is registered with the GC. */
12210 rb_fs = Qnil;
12211 rb_define_hooked_variable("$;", &rb_fs, 0, rb_fs_setter);
12212 rb_define_hooked_variable("$-F", &rb_fs, 0, rb_fs_setter);
12213 rb_gc_register_address(&rb_fs);
12214
/* Symbol class and its methods. */
12215 rb_cSymbol = rb_define_class("Symbol", rb_cObject);
12219 rb_define_singleton_method(rb_cSymbol, "all_symbols", sym_all_symbols, 0);
12220
12221 rb_define_method(rb_cSymbol, "==", sym_equal, 1);
12222 rb_define_method(rb_cSymbol, "===", sym_equal, 1);
12223 rb_define_method(rb_cSymbol, "inspect", sym_inspect, 0);
12225 rb_define_method(rb_cSymbol, "id2name", rb_sym_to_s, 0);
12226 rb_define_method(rb_cSymbol, "name", rb_sym2str, 0); /* in symbol.c */
12227 rb_define_method(rb_cSymbol, "to_proc", rb_sym_to_proc, 0); /* in proc.c */
12228 rb_define_method(rb_cSymbol, "succ", sym_succ, 0);
12229 rb_define_method(rb_cSymbol, "next", sym_succ, 0);
12230
12231 rb_define_method(rb_cSymbol, "<=>", sym_cmp, 1);
12232 rb_define_method(rb_cSymbol, "casecmp", sym_casecmp, 1);
12233 rb_define_method(rb_cSymbol, "casecmp?", sym_casecmp_p, 1);
12234 rb_define_method(rb_cSymbol, "=~", sym_match, 1);
12235
12236 rb_define_method(rb_cSymbol, "[]", sym_aref, -1);
12237 rb_define_method(rb_cSymbol, "slice", sym_aref, -1);
12238 rb_define_method(rb_cSymbol, "length", sym_length, 0);
12239 rb_define_method(rb_cSymbol, "size", sym_length, 0);
12240 rb_define_method(rb_cSymbol, "empty?", sym_empty, 0);
12241 rb_define_method(rb_cSymbol, "match", sym_match_m, -1);
12242 rb_define_method(rb_cSymbol, "match?", sym_match_m_p, -1);
12243
12244 rb_define_method(rb_cSymbol, "upcase", sym_upcase, -1);
12245 rb_define_method(rb_cSymbol, "downcase", sym_downcase, -1);
12246 rb_define_method(rb_cSymbol, "capitalize", sym_capitalize, -1);
12247 rb_define_method(rb_cSymbol, "swapcase", sym_swapcase, -1);
12248
12249 rb_define_method(rb_cSymbol, "start_with?", sym_start_with, -1);
12250 rb_define_method(rb_cSymbol, "end_with?", sym_end_with, -1);
12251
12252 rb_define_method(rb_cSymbol, "encoding", sym_encoding, 0);
12253}
#define RUBY_ASSERT(expr)
Asserts that the given expression is truthy if and only if RUBY_DEBUG is truthy.
Definition assert.h:177
#define RUBY_ASSERT_ALWAYS(expr)
A variant of RUBY_ASSERT that is checked regardless of RUBY_DEBUG.
Definition assert.h:167
Atomic operations.
static enum ruby_coderange_type RB_ENC_CODERANGE_AND(enum ruby_coderange_type a, enum ruby_coderange_type b)
"Mix" two code ranges into one.
Definition coderange.h:162
static int rb_isspace(int c)
Our own locale-insensitive version of isspace(3).
Definition ctype.h:395
#define rb_define_method(klass, mid, func, arity)
Defines klass#mid.
#define rb_define_singleton_method(klass, mid, func, arity)
Defines klass.mid.
static bool rb_enc_isascii(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isascii(), except it additionally takes an encoding.
Definition ctype.h:82
static bool rb_enc_is_newline(const char *p, const char *e, rb_encoding *enc)
Queries if the passed pointer points to a newline character.
Definition ctype.h:43
static bool rb_enc_isprint(OnigCodePoint c, rb_encoding *enc)
Identical to rb_isprint(), except it additionally takes an encoding.
Definition ctype.h:180
static bool rb_enc_isctype(OnigCodePoint c, OnigCtype t, rb_encoding *enc)
Queries if the passed code point is of passed character type in the passed encoding.
Definition ctype.h:63
VALUE rb_enc_sprintf(rb_encoding *enc, const char *fmt,...)
Identical to rb_sprintf(), except it additionally takes an encoding.
Definition sprintf.c:1200
static VALUE RB_OBJ_FROZEN_RAW(VALUE obj)
This is an implementation detail of RB_OBJ_FROZEN().
Definition fl_type.h:883
@ RUBY_FL_FREEZE
This flag has something to do with data immutability.
Definition fl_type.h:324
void rb_include_module(VALUE klass, VALUE module)
Includes a module to a class.
Definition class.c:1172
VALUE rb_define_class(const char *name, VALUE super)
Defines a top-level class.
Definition class.c:970
VALUE rb_define_module(const char *name)
Defines a top-level module.
Definition class.c:1080
void rb_define_alias(VALUE klass, const char *name1, const char *name2)
Defines an alias of a method.
Definition class.c:2331
void rb_undef_method(VALUE klass, const char *name)
Defines an undef of a method.
Definition class.c:2155
int rb_scan_args(int argc, const VALUE *argv, const char *fmt,...)
Retrieves argument from argc and argv to given VALUE references according to the format string.
Definition class.c:2621
int rb_block_given_p(void)
Determines if the current method is given a block.
Definition eval.c:866
int rb_get_kwargs(VALUE keyword_hash, const ID *table, int required, int optional, VALUE *values)
Keyword argument deconstructor.
Definition class.c:2410
#define TYPE(_)
Old name of rb_type.
Definition value_type.h:107
#define NEWOBJ_OF
Old name of RB_NEWOBJ_OF.
Definition newobj.h:61
#define ENCODING_SET_INLINED(obj, i)
Old name of RB_ENCODING_SET_INLINED.
Definition encoding.h:105
#define RB_INTEGER_TYPE_P
Old name of rb_integer_type_p.
Definition value_type.h:87
#define ENC_CODERANGE_7BIT
Old name of RUBY_ENC_CODERANGE_7BIT.
Definition coderange.h:180
#define ENC_CODERANGE_VALID
Old name of RUBY_ENC_CODERANGE_VALID.
Definition coderange.h:181
#define FL_UNSET_RAW
Old name of RB_FL_UNSET_RAW.
Definition fl_type.h:134
#define rb_str_buf_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1682
#define FL_EXIVAR
Old name of RUBY_FL_EXIVAR.
Definition fl_type.h:66
#define ALLOCV
Old name of RB_ALLOCV.
Definition memory.h:398
#define ISSPACE
Old name of rb_isspace.
Definition ctype.h:88
#define T_STRING
Old name of RUBY_T_STRING.
Definition value_type.h:78
#define ENC_CODERANGE_CLEAN_P(cr)
Old name of RB_ENC_CODERANGE_CLEAN_P.
Definition coderange.h:183
#define ENC_CODERANGE_AND(a, b)
Old name of RB_ENC_CODERANGE_AND.
Definition coderange.h:188
#define xfree
Old name of ruby_xfree.
Definition xmalloc.h:58
#define Qundef
Old name of RUBY_Qundef.
#define INT2FIX
Old name of RB_INT2FIX.
Definition long.h:48
#define OBJ_FROZEN
Old name of RB_OBJ_FROZEN.
Definition fl_type.h:137
#define rb_str_cat2
Old name of rb_str_cat_cstr.
Definition string.h:1683
#define UNREACHABLE
Old name of RBIMPL_UNREACHABLE.
Definition assume.h:28
#define ID2SYM
Old name of RB_ID2SYM.
Definition symbol.h:44
#define OBJ_FREEZE_RAW
Old name of RB_OBJ_FREEZE_RAW.
Definition fl_type.h:136
#define OBJ_FREEZE
Old name of RB_OBJ_FREEZE.
Definition fl_type.h:135
#define T_FIXNUM
Old name of RUBY_T_FIXNUM.
Definition value_type.h:63
#define UNREACHABLE_RETURN
Old name of RBIMPL_UNREACHABLE_RETURN.
Definition assume.h:29
#define SYM2ID
Old name of RB_SYM2ID.
Definition symbol.h:45
#define ENC_CODERANGE(obj)
Old name of RB_ENC_CODERANGE.
Definition coderange.h:184
#define CLASS_OF
Old name of rb_class_of.
Definition globals.h:203
#define ENC_CODERANGE_UNKNOWN
Old name of RUBY_ENC_CODERANGE_UNKNOWN.
Definition coderange.h:179
#define SIZET2NUM
Old name of RB_SIZE2NUM.
Definition size_t.h:62
#define FIXABLE
Old name of RB_FIXABLE.
Definition fixnum.h:25
#define xmalloc
Old name of ruby_xmalloc.
Definition xmalloc.h:53
#define ENCODING_GET(obj)
Old name of RB_ENCODING_GET.
Definition encoding.h:108
#define LONG2FIX
Old name of RB_INT2FIX.
Definition long.h:49
#define ISDIGIT
Old name of rb_isdigit.
Definition ctype.h:93
#define ENC_CODERANGE_MASK
Old name of RUBY_ENC_CODERANGE_MASK.
Definition coderange.h:178
#define ZALLOC_N
Old name of RB_ZALLOC_N.
Definition memory.h:395
#define ALLOC_N
Old name of RB_ALLOC_N.
Definition memory.h:393
#define MBCLEN_CHARFOUND_LEN(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_LEN.
Definition encoding.h:516
#define FL_TEST_RAW
Old name of RB_FL_TEST_RAW.
Definition fl_type.h:132
#define FL_SET
Old name of RB_FL_SET.
Definition fl_type.h:129
#define rb_ary_new3
Old name of rb_ary_new_from_args.
Definition array.h:652
#define ENCODING_INLINE_MAX
Old name of RUBY_ENCODING_INLINE_MAX.
Definition encoding.h:66
#define LONG2NUM
Old name of RB_LONG2NUM.
Definition long.h:50
#define ISALPHA
Old name of rb_isalpha.
Definition ctype.h:92
#define MBCLEN_INVALID_P(ret)
Old name of ONIGENC_MBCLEN_INVALID_P.
Definition encoding.h:517
#define ISASCII
Old name of rb_isascii.
Definition ctype.h:85
#define TOLOWER
Old name of rb_tolower.
Definition ctype.h:101
#define Qtrue
Old name of RUBY_Qtrue.
#define ST2FIX
Old name of RB_ST2FIX.
Definition st_data_t.h:33
#define MBCLEN_NEEDMORE_P(ret)
Old name of ONIGENC_MBCLEN_NEEDMORE_P.
Definition encoding.h:518
#define FIXNUM_MAX
Old name of RUBY_FIXNUM_MAX.
Definition fixnum.h:26
#define NUM2INT
Old name of RB_NUM2INT.
Definition int.h:44
#define Qnil
Old name of RUBY_Qnil.
#define Qfalse
Old name of RUBY_Qfalse.
#define FIX2LONG
Old name of RB_FIX2LONG.
Definition long.h:46
#define ENC_CODERANGE_BROKEN
Old name of RUBY_ENC_CODERANGE_BROKEN.
Definition coderange.h:182
#define scan_hex(s, l, e)
Old name of ruby_scan_hex.
Definition util.h:108
#define NIL_P
Old name of RB_NIL_P.
#define MBCLEN_CHARFOUND_P(ret)
Old name of ONIGENC_MBCLEN_CHARFOUND_P.
Definition encoding.h:515
#define FL_WB_PROTECTED
Old name of RUBY_FL_WB_PROTECTED.
Definition fl_type.h:59
#define DBL2NUM
Old name of rb_float_new.
Definition double.h:29
#define ISPRINT
Old name of rb_isprint.
Definition ctype.h:86
#define BUILTIN_TYPE
Old name of RB_BUILTIN_TYPE.
Definition value_type.h:85
#define ENCODING_SHIFT
Old name of RUBY_ENCODING_SHIFT.
Definition encoding.h:67
#define FL_TEST
Old name of RB_FL_TEST.
Definition fl_type.h:131
#define FL_FREEZE
Old name of RUBY_FL_FREEZE.
Definition fl_type.h:67
#define NUM2LONG
Old name of RB_NUM2LONG.
Definition long.h:51
#define ENCODING_GET_INLINED(obj)
Old name of RB_ENCODING_GET_INLINED.
Definition encoding.h:107
#define ENC_CODERANGE_CLEAR(obj)
Old name of RB_ENC_CODERANGE_CLEAR.
Definition coderange.h:187
#define FL_UNSET
Old name of RB_FL_UNSET.
Definition fl_type.h:133
#define UINT2NUM
Old name of RB_UINT2NUM.
Definition int.h:46
#define ENCODING_IS_ASCII8BIT(obj)
Old name of RB_ENCODING_IS_ASCII8BIT.
Definition encoding.h:109
#define FIXNUM_P
Old name of RB_FIXNUM_P.
#define CONST_ID
Old name of RUBY_CONST_ID.
Definition symbol.h:47
#define rb_ary_new2
Old name of rb_ary_new_capa.
Definition array.h:651
#define ENC_CODERANGE_SET(obj, cr)
Old name of RB_ENC_CODERANGE_SET.
Definition coderange.h:186
#define ENCODING_CODERANGE_SET(obj, encindex, cr)
Old name of RB_ENCODING_CODERANGE_SET.
Definition coderange.h:189
#define FL_SET_RAW
Old name of RB_FL_SET_RAW.
Definition fl_type.h:130
#define SYMBOL_P
Old name of RB_SYMBOL_P.
Definition value_type.h:88
#define OBJ_FROZEN_RAW
Old name of RB_OBJ_FROZEN_RAW.
Definition fl_type.h:138
#define T_REGEXP
Old name of RUBY_T_REGEXP.
Definition value_type.h:77
#define ENCODING_MASK
Old name of RUBY_ENCODING_MASK.
Definition encoding.h:68
void rb_category_warn(rb_warning_category_t category, const char *fmt,...)
Identical to rb_category_warning(), except it reports unless $VERBOSE is nil.
Definition error.c:433
void rb_syserr_fail(int e, const char *mesg)
Raises appropriate exception that represents a C errno.
Definition error.c:3567
VALUE rb_eRangeError
RangeError exception.
Definition error.c:1348
VALUE rb_eTypeError
TypeError exception.
Definition error.c:1344
VALUE rb_eEncCompatError
Encoding::CompatibilityError exception.
Definition error.c:1351
VALUE rb_eRuntimeError
RuntimeError exception.
Definition error.c:1342
VALUE rb_eIndexError
IndexError exception.
Definition error.c:1346
@ RB_WARN_CATEGORY_DEPRECATED
Warning is for deprecated features.
Definition error.h:48
VALUE rb_any_to_s(VALUE obj)
Generates a textual representation of the given object.
Definition object.c:625
VALUE rb_obj_alloc(VALUE klass)
Allocates an instance of the given class.
Definition object.c:2049
VALUE rb_class_new_instance_pass_kw(int argc, const VALUE *argv, VALUE klass)
Identical to rb_class_new_instance(), except it passes the passed keywords if any to the #initialize ...
Definition object.c:2067
VALUE rb_obj_frozen_p(VALUE obj)
Just calls RB_OBJ_FROZEN() inside.
Definition object.c:1228
double rb_str_to_dbl(VALUE str, int mode)
Identical to rb_cstr_to_dbl(), except it accepts a Ruby string instead of a C string.
Definition object.c:3422
VALUE rb_obj_class(VALUE obj)
Queries the class of an object.
Definition object.c:215
VALUE rb_obj_dup(VALUE obj)
Duplicates the given object.
Definition object.c:529
VALUE rb_cSymbol
Symbol class.
Definition string.c:79
VALUE rb_equal(VALUE lhs, VALUE rhs)
This function is an optimised version of calling #==.
Definition object.c:147
VALUE rb_mComparable
Comparable module.
Definition compar.c:19
VALUE rb_cString
String class.
Definition string.c:78
VALUE rb_to_int(VALUE val)
Identical to rb_check_to_int(), except it raises in case of conversion mismatch.
Definition object.c:3136
#define RB_OBJ_WRITE(old, slot, young)
Declaration of a "back" pointer.
Definition gc.h:619
Encoding related APIs.
static char * rb_enc_left_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the left boundary of a character.
Definition encoding.h:682
static char * rb_enc_right_char_head(const char *s, const char *p, const char *e, rb_encoding *enc)
Queries the right boundary of a character.
Definition encoding.h:703
static unsigned int rb_enc_codepoint(const char *p, const char *e, rb_encoding *enc)
Queries the code point of character pointed by the passed pointer.
Definition encoding.h:570
static int rb_enc_mbmaxlen(rb_encoding *enc)
Queries the maximum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:446
static int RB_ENCODING_GET_INLINED(VALUE obj)
Queries the encoding of the passed object.
Definition encoding.h:98
static OnigCodePoint rb_enc_mbc_to_codepoint(const char *p, const char *e, rb_encoding *enc)
Identical to rb_enc_codepoint(), except it assumes the passed character is not broken.
Definition encoding.h:590
static int rb_enc_mbminlen(rb_encoding *enc)
Queries the minimum number of bytes that the passed encoding needs to represent a character.
Definition encoding.h:431
static int rb_enc_code_to_mbclen(int c, rb_encoding *enc)
Identical to rb_enc_codelen(), except it returns 0 for invalid code points.
Definition encoding.h:618
static char * rb_enc_step_back(const char *s, const char *p, const char *e, int n, rb_encoding *enc)
Scans the string backwards for n characters.
Definition encoding.h:725
VALUE rb_str_conv_enc(VALUE str, rb_encoding *from, rb_encoding *to)
Encoding conversion main routine.
Definition string.c:1149
int rb_enc_str_coderange(VALUE str)
Scans the passed string to collect its code range.
Definition string.c:769
VALUE rb_enc_str_new_static(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it takes a C string literal.
Definition string.c:1015
char * rb_enc_nth(const char *head, const char *tail, long nth, rb_encoding *enc)
Queries the n-th character.
Definition string.c:2730
VALUE rb_str_conv_enc_opts(VALUE str, rb_encoding *from, rb_encoding *to, int ecflags, VALUE ecopts)
Identical to rb_str_conv_enc(), except it additionally takes IO encoder options.
Definition string.c:1034
VALUE rb_enc_interned_str(const char *ptr, long len, rb_encoding *enc)
Identical to rb_enc_str_new(), except it returns a "f"string.
Definition string.c:12027
long rb_memsearch(const void *x, long m, const void *y, long n, rb_encoding *enc)
Looks for the passed string in the passed buffer.
Definition re.c:249
long rb_enc_strlen(const char *head, const char *tail, rb_encoding *enc)
Counts the number of characters of the passed string, according to the passed encoding.
Definition string.c:2074
VALUE rb_enc_str_new_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:962
VALUE rb_str_export_to_enc(VALUE obj, rb_encoding *enc)
Identical to rb_str_export(), except it additionally takes an encoding.
Definition string.c:1254
VALUE rb_external_str_new_with_enc(const char *ptr, long len, rb_encoding *enc)
Identical to rb_external_str_new(), except it additionally takes an encoding.
Definition string.c:1155
int rb_enc_str_asciionly_p(VALUE str)
Queries if the passed string is "ASCII only".
Definition string.c:781
VALUE rb_enc_interned_str_cstr(const char *ptr, rb_encoding *enc)
Identical to rb_enc_str_new_cstr(), except it returns a "f"string.
Definition string.c:12038
long rb_str_coderange_scan_restartable(const char *str, const char *end, rb_encoding *enc, int *cr)
Scans the passed string until it finds something odd.
Definition string.c:653
int rb_enc_symname2_p(const char *name, long len, rb_encoding *enc)
Identical to rb_enc_symname_p(), except it additionally takes the passed string's length.
Definition symbol.c:414
rb_econv_result_t rb_econv_convert(rb_econv_t *ec, const unsigned char **source_buffer_ptr, const unsigned char *source_buffer_end, unsigned char **destination_buffer_ptr, unsigned char *destination_buffer_end, int flags)
Converts a string from an encoding to another.
Definition transcode.c:1475
rb_econv_result_t
return value of rb_econv_convert()
Definition transcode.h:30
@ econv_finished
The conversion stopped after converting everything.
Definition transcode.h:57
@ econv_destination_buffer_full
The conversion stopped because there is no destination.
Definition transcode.h:46
rb_econv_t * rb_econv_open_opts(const char *source_encoding, const char *destination_encoding, int ecflags, VALUE ecopts)
Identical to rb_econv_open(), except it additionally takes a hash of optional strings.
Definition transcode.c:2651
VALUE rb_str_encode(VALUE str, VALUE to, int ecflags, VALUE ecopts)
Converts the contents of the passed string from its encoding to the passed one.
Definition transcode.c:2914
void rb_econv_close(rb_econv_t *ec)
Destructs a converter.
Definition transcode.c:1731
VALUE rb_funcall(VALUE recv, ID mid, int n,...)
Calls a method.
Definition vm_eval.c:1121
VALUE rb_funcall_with_block_kw(VALUE recv, ID mid, int argc, const VALUE *argv, VALUE procval, int kw_splat)
Identical to rb_funcallv_with_block(), except you can specify how to handle the last element of the g...
Definition vm_eval.c:1208
#define RGENGC_WB_PROTECTED_STRING
This is a compile-time flag to enable/disable write barrier for struct RString.
Definition gc.h:495
#define RETURN_SIZED_ENUMERATOR(obj, argc, argv, size_fn)
This roughly resembles return enum_for(__callee__) unless block_given?.
Definition enumerator.h:206
#define RETURN_ENUMERATOR(obj, argc, argv)
Identical to RETURN_SIZED_ENUMERATOR(), except its size is unknown.
Definition enumerator.h:239
#define UNLIMITED_ARGUMENTS
This macro is used in conjunction with rb_check_arity().
Definition error.h:35
#define rb_check_frozen
Just another name of rb_check_frozen_inline.
Definition error.h:264
static int rb_check_arity(int argc, int min, int max)
Ensures that the passed integer is in the passed range.
Definition error.h:280
VALUE rb_fs
The field separator character for inputs, or the $;.
Definition string.c:538
VALUE rb_backref_get(void)
Queries the last match, or Regexp.last_match, or the $~.
Definition vm.c:1793
VALUE rb_sym_all_symbols(void)
Collects every single symbol that has ever been interned in the entire history of the current process.
Definition symbol.c:1017
void rb_backref_set(VALUE md)
Updates $~.
Definition vm.c:1799
VALUE rb_range_beg_len(VALUE range, long *begp, long *lenp, long len, int err)
Deconstructs a numerical range.
Definition range.c:1744
int rb_reg_backref_number(VALUE match, VALUE backref)
Queries the index of the given named capture.
Definition re.c:1232
int rb_reg_options(VALUE re)
Queries the options of the passed regular expression.
Definition re.c:4138
VALUE rb_reg_match(VALUE re, VALUE str)
This is the match operator.
Definition re.c:3635
void rb_match_busy(VALUE md)
Asserts that the given MatchData is "occupied".
Definition re.c:1438
VALUE rb_reg_nth_match(int n, VALUE md)
Queries the nth captured substring.
Definition re.c:1845
VALUE rb_str_to_interned_str(VALUE str)
Identical to rb_interned_str(), except it takes a Ruby string instead of a C string.
Definition string.c:12008
void rb_str_free(VALUE str)
Destroys the given string for no reason.
Definition string.c:1502
VALUE rb_str_new_shared(VALUE str)
Identical to rb_str_new_cstr(), except it takes a Ruby string instead of a C string.
Definition string.c:1318
VALUE rb_str_plus(VALUE lhs, VALUE rhs)
Generates a new string, concatenating the former to the latter.
Definition string.c:2225
#define rb_utf8_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "UTF-8" encoding.
Definition string.h:1583
#define rb_hash_end(h)
Just another name of st_hash_end.
Definition string.h:945
#define rb_hash_uint32(h, i)
Just another name of st_hash_uint32.
Definition string.h:939
VALUE rb_str_append(VALUE dst, VALUE src)
Identical to rb_str_buf_append(), except it converts the right hand side before concatenating.
Definition string.c:3382
VALUE rb_filesystem_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "filesystem" encoding.
Definition string.c:1230
VALUE rb_sym_to_s(VALUE sym)
This is an rb_sym2str() + rb_str_dup() combo.
Definition string.c:11659
VALUE rb_str_times(VALUE str, VALUE num)
Repetition of a string.
Definition string.c:2297
VALUE rb_external_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "default external" encoding.
Definition string.c:1206
VALUE rb_str_tmp_new(long len)
Allocates a "temporary" string.
Definition string.c:1496
long rb_str_offset(VALUE str, long pos)
"Inverse" of rb_str_sublen().
Definition string.c:2758
VALUE rb_str_succ(VALUE orig)
Searches for the "successor" of a string.
Definition string.c:4826
int rb_str_hash_cmp(VALUE str1, VALUE str2)
Compares two strings.
Definition string.c:3598
VALUE rb_str_ellipsize(VALUE str, long len)
Shortens str and adds three dots, an ellipsis, if it is longer than len characters.
Definition string.c:10952
st_index_t rb_memhash(const void *ptr, long len)
This is a universal hash function.
Definition random.c:1747
#define rb_str_new(str, len)
Allocates an instance of rb_cString.
Definition string.h:1498
void rb_str_shared_replace(VALUE dst, VALUE src)
Replaces the contents of the former with the latter.
Definition string.c:1549
#define rb_str_buf_cat
Just another name of rb_str_cat.
Definition string.h:1681
VALUE rb_str_new_static(const char *ptr, long len)
Identical to rb_str_new(), except it takes a C string literal.
Definition string.c:997
#define rb_usascii_str_new(str, len)
Identical to rb_str_new, except it generates a string of "US ASCII" encoding.
Definition string.h:1532
size_t rb_str_capacity(VALUE str)
Queries the capacity of the given string.
Definition string.c:815
st_index_t rb_str_hash(VALUE str)
Calculates a hash value of a string.
Definition string.c:3587
VALUE rb_str_locktmp(VALUE str)
Obtains a "temporary lock" of the string.
long rb_str_strlen(VALUE str)
Counts the number of characters (not bytes) that are stored inside of the given string.
Definition string.c:2163
VALUE rb_str_resurrect(VALUE str)
I guess there is no use case of this function in extension libraries, but this is a routine identical...
Definition string.c:1766
#define rb_str_buf_new_cstr(str)
Identical to rb_str_new_cstr, except done differently.
Definition string.h:1639
#define rb_usascii_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "US ASCII" encoding.
Definition string.h:1567
VALUE rb_str_replace(VALUE dst, VALUE src)
Replaces the contents of the former object with the stringised contents of the latter.
Definition string.c:6033
char * rb_str_subpos(VALUE str, long beg, long *len)
Identical to rb_str_substr(), except it returns a C string instead of a Ruby string.
Definition string.c:2863
rb_gvar_setter_t rb_str_setter
This is a rb_gvar_setter_t that refutes non-string assignments.
Definition string.h:1146
VALUE rb_interned_str_cstr(const char *ptr)
Identical to rb_interned_str(), except it assumes the passed pointer is a pointer to a C string.
Definition string.c:12021
#define rb_external_str_new_cstr(str)
Identical to rb_str_new_cstr, except it generates a string of "default external" encoding.
Definition string.h:1604
long rb_str_sublen(VALUE str, long pos)
Byte offset to character offset conversion.
Definition string.c:2805
VALUE rb_str_equal(VALUE str1, VALUE str2)
Equality of two strings.
Definition string.c:3700
VALUE rb_str_inspect(VALUE str)
Generates a "readable" version of the receiver.
Definition string.c:6745
void rb_must_asciicompat(VALUE obj)
Asserts that the given string's encoding is (Ruby's definition of) ASCII compatible.
Definition string.c:2503
VALUE rb_interned_str(const char *ptr, long len)
Identical to rb_str_new(), except it returns an infamous "f"string.
Definition string.c:12014
int rb_str_cmp(VALUE lhs, VALUE rhs)
Compares two strings, as in strcmp(3).
Definition string.c:3654
VALUE rb_str_concat(VALUE dst, VALUE src)
Identical to rb_str_append(), except it also accepts an integer as a codepoint.
Definition string.c:3473
int rb_str_comparable(VALUE str1, VALUE str2)
Checks if two strings are comparable each other or not.
Definition string.c:3629
#define rb_strlen_lit(str)
Length of a string literal.
Definition string.h:1692
VALUE rb_str_buf_cat_ascii(VALUE dst, const char *src)
Identical to rb_str_cat_cstr(), except it additionally assumes the source string be a NUL terminated ...
Definition string.c:3324
VALUE rb_str_freeze(VALUE str)
This is the implementation of String#freeze.
Definition string.c:2972
void rb_str_update(VALUE dst, long beg, long len, VALUE src)
Replaces some (or all) of the contents of the given string.
Definition string.c:5336
VALUE rb_str_scrub(VALUE str, VALUE repl)
"Cleanses" the string.
Definition string.c:11010
#define rb_locale_str_new_cstr(str)
Identical to rb_external_str_new_cstr, except it generates a string of "locale" encoding instead of "...
Definition string.h:1625
VALUE rb_str_new_with_class(VALUE obj, const char *ptr, long len)
Identical to rb_str_new(), except it takes the class of the allocating object.
Definition string.c:1452
#define rb_str_dup_frozen
Just another name of rb_str_new_frozen.
Definition string.h:631
VALUE rb_check_string_type(VALUE obj)
Try converting an object to its stringised representation using its to_str method,...
Definition string.c:2654
VALUE rb_str_substr(VALUE str, long beg, long len)
This is the implementation of two-argumented String#slice.
Definition string.c:2950
#define rb_str_cat_cstr(buf, str)
Identical to rb_str_cat(), except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1656
VALUE rb_str_unlocktmp(VALUE str)
Releases a lock formerly obtained by rb_str_locktmp().
Definition string.c:3043
VALUE rb_utf8_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "UTF-8" encoding instead of "binary...
Definition string.c:1009
#define rb_utf8_str_new(str, len)
Identical to rb_str_new, except it generates a string of "UTF-8" encoding.
Definition string.h:1549
void rb_str_modify_expand(VALUE str, long capa)
Identical to rb_str_modify(), except it additionally expands the capacity of the receiver.
Definition string.c:2459
VALUE rb_str_dump(VALUE str)
"Inverse" of rb_eval_string().
Definition string.c:6859
VALUE rb_locale_str_new(const char *ptr, long len)
Identical to rb_str_new(), except it generates a string of "locale" encoding.
Definition string.c:1218
VALUE rb_str_length(VALUE)
Identical to rb_str_strlen(), except it returns the value in rb_cInteger.
Definition string.c:2177
#define rb_str_new_cstr(str)
Identical to rb_str_new, except it assumes the passed pointer is a pointer to a C string.
Definition string.h:1514
VALUE rb_str_drop_bytes(VALUE str, long len)
Shrinks the given string for the given number of bytes.
Definition string.c:5254
VALUE rb_str_split(VALUE str, const char *delim)
Divides the given string based on the given delimiter.
Definition string.c:8937
VALUE rb_usascii_str_new_static(const char *ptr, long len)
Identical to rb_str_new_static(), except it generates a string of "US ASCII" encoding instead of "bin...
Definition string.c:1003
int rb_respond_to(VALUE obj, ID mid)
Queries if the object responds to the method.
Definition vm_method.c:2822
void rb_undef_alloc_func(VALUE klass)
Deletes the allocator function of a class.
Definition vm_method.c:1159
void rb_define_alloc_func(VALUE klass, rb_alloc_func_t func)
Sets the allocator function of a class.
static ID rb_intern_const(const char *str)
This is a "tiny optimisation" over rb_intern().
Definition symbol.h:276
VALUE rb_sym2str(VALUE id)
Identical to rb_id2str(), except it takes an instance of rb_cSymbol rather than an ID.
Definition symbol.c:950
VALUE rb_to_symbol(VALUE name)
Identical to rb_intern_str(), except it generates a dynamic symbol if necessary.
Definition string.c:11981
ID rb_to_id(VALUE str)
Definition string.c:11971
int capa
Designed capacity of the buffer.
Definition io.h:11
int off
Offset inside of ptr.
Definition io.h:5
int len
Length of the buffer.
Definition io.h:8
long rb_reg_search(VALUE re, VALUE str, long pos, int dir)
Runs the passed regular expression over the passed string.
Definition re.c:1784
VALUE rb_reg_regcomp(VALUE str)
Creates a new instance of rb_cRegexp.
Definition re.c:3419
VALUE rb_reg_regsub(VALUE repl, VALUE src, struct re_registers *regs, VALUE rexp)
Substitution.
Definition re.c:4382
VALUE rb_str_format(int argc, const VALUE *argv, VALUE fmt)
Formats a string.
Definition sprintf.c:214
VALUE rb_yield(VALUE val)
Yields the block.
Definition vm_eval.c:1376
#define MEMCPY(p1, p2, type, n)
Handy macro to call memcpy.
Definition memory.h:366
#define ALLOCA_N(type, n)
Definition memory.h:286
#define MEMZERO(p, type, n)
Handy macro to erase a region of memory.
Definition memory.h:354
#define RB_GC_GUARD(v)
Prevents premature destruction of local objects.
Definition memory.h:161
void rb_define_hooked_variable(const char *q, VALUE *w, type *e, void_type *r)
Defines a function-backed global variable.
VALUE rb_ensure(type *q, VALUE w, type *e, VALUE r)
An equivalent of the ensure clause.
static int RARRAY_LENINT(VALUE ary)
Identical to rb_array_len(), except it differs for the return type.
Definition rarray.h:281
#define RARRAY_CONST_PTR
Just another name of rb_array_const_ptr.
Definition rarray.h:52
static VALUE RBASIC_CLASS(VALUE obj)
Queries the class of an object.
Definition rbasic.h:152
#define RBASIC(obj)
Convenient casting macro.
Definition rbasic.h:40
#define DATA_PTR(obj)
Convenient getter macro.
Definition rdata.h:71
static struct re_registers * RMATCH_REGS(VALUE match)
Queries the raw re_registers.
Definition rmatch.h:138
static VALUE RREGEXP_SRC(VALUE rexp)
Convenient getter function.
Definition rregexp.h:103
#define StringValue(v)
Ensures that the parameter object is a String.
Definition rstring.h:66
VALUE rb_str_export_locale(VALUE obj)
Identical to rb_str_export(), except it converts into the locale encoding instead.
Definition string.c:1248
char * rb_string_value_cstr(volatile VALUE *ptr)
Identical to rb_string_value_ptr(), except it additionally checks for the contents for viability as a...
Definition string.c:2631
static int RSTRING_LENINT(VALUE str)
Identical to RSTRING_LEN(), except it differs for the return type.
Definition rstring.h:468
#define RSTRING_GETMEM(str, ptrvar, lenvar)
Convenient macro to obtain the contents and length at once.
Definition rstring.h:488
VALUE rb_string_value(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it fills the passed pointer with the converted object.
Definition string.c:2515
#define RSTRING(obj)
Convenient casting macro.
Definition rstring.h:41
VALUE rb_str_export(VALUE obj)
Identical to rb_str_to_str(), except it additionally converts the string into default external encodi...
Definition string.c:1242
char * rb_string_value_ptr(volatile VALUE *ptr)
Identical to rb_str_to_str(), except it returns the converted string's backend memory region.
Definition string.c:2526
VALUE rb_str_to_str(VALUE obj)
Identical to rb_check_string_type(), except it raises exceptions in case of conversion failures.
Definition string.c:1540
#define StringValueCStr(v)
Identical to StringValuePtr, except it additionally checks for the contents for viability as a C stri...
Definition rstring.h:89
#define TypedData_Wrap_Struct(klass, data_type, sval)
Converts sval, a pointer to your struct, into a Ruby object.
Definition rtypeddata.h:449
VALUE rb_require(const char *feature)
Identical to rb_require_string(), except it takes C's string instead of Ruby's.
Definition load.c:1394
#define errno
Ractor-aware version of errno.
Definition ruby.h:388
#define RTEST
This is an old name of RB_TEST.
#define _(args)
This was a transition path from K&R to ANSI.
Definition stdarg.h:35
VALUE flags
Per-object flags.
Definition rbasic.h:77
Ruby's String.
Definition rstring.h:196
struct RBasic basic
Basic part, including flags and class.
Definition rstring.h:199
long capa
Capacity of *ptr.
Definition rstring.h:232
union RString::@51::@52::@54 aux
Auxiliary info.
long len
Length of the string, not including terminating NUL character.
Definition rstring.h:206
struct RString::@51::@53 embed
Embedded contents.
VALUE shared
Parent of the string.
Definition rstring.h:240
union RString::@51 as
String's specific fields.
char * ptr
Pointer to the contents of the string.
Definition rstring.h:222
struct RString::@51::@52 heap
Strings that use separated memory region for contents use this pattern.
This is the struct that holds necessary info for a struct.
Definition rtypeddata.h:200
Definition st.h:79
Definition string.c:7817
void rb_nativethread_lock_lock(rb_nativethread_lock_t *lock)
Blocks until the current thread obtains a lock.
Definition thread.c:293
uintptr_t ID
Type that represents a Ruby identifier such as a variable name.
Definition value.h:52
uintptr_t VALUE
Type that represents a Ruby object.
Definition value.h:40
static void Check_Type(VALUE v, enum ruby_value_type t)
Identical to RB_TYPE_P(), except it raises exceptions on predication failure.
Definition value_type.h:432