Fix a possible buffer overflow with ASCII regexes that really are

Unicode regexes. Obtained from: perl5-porters (Nicholas Clark), with modifications Approved by: portmgr (marcus)
author: Anton Berezin <tobez@FreeBSD.org> 2007-11-06 22:08:03 +0000
committer: Anton Berezin <tobez@FreeBSD.org> 2007-11-06 22:08:03 +0000
commit: bbc34ddf1100af44360c6c35dce71535bafa5a89 (patch)
tree: 4b0adecbac01f652e93707e150100970d586a53c /lang
parent: - irssi-devel should be using @dirrmtry in a few cases. this is (diff)
10 files changed, 480 insertions, 0 deletions
diff --git a/lang/perl5.10/Makefile b/lang/perl5.10/Makefile
index 13c8a8237609..3c607d2a42a9 100644
--- a/lang/perl5.10/Makefile
+++ b/lang/perl5.10/Makefile
@@ -7,6 +7,7 @@
 
 PORTNAME=	perl
 PORTVERSION=	${PERL_VER}
+PORTREVISION=	1
 CATEGORIES=	lang devel perl5
 MASTER_SITES=	${MASTER_SITE_PERL_CPAN} \
 		${MASTER_SITE_LOCAL:S/$/:local/} \
diff --git a/lang/perl5.10/files/patch-utf-regcomp b/lang/perl5.10/files/patch-utf-regcomp
new file mode 100644
index 000000000000..44c9a55de49a
--- /dev/null
+++ b/lang/perl5.10/files/patch-utf-regcomp
@@ -0,0 +1,95 @@
+--- regcomp.c.orig	2006-01-08 21:59:27.000000000 +0100
++++ regcomp.c	2007-11-06 22:48:26.000000000 +0100
+@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
+     I32		extralen;
+     I32		seen_zerolen;
+     I32		seen_evals;
+-    I32		utf8;
++    I32		utf8;		/* whether the pattern is utf8 or not */
++    I32		orig_utf8;	/* whether the pattern was originally in utf8 */
++				/* XXX use this for future optimisation of case
++				 * where pattern must be upgraded to utf8. */
+ #if ADD_TO_REGEXEC
+     char 	*starttry;		/* -Dr: where regtry was called. */
+ #define RExC_starttry	(pRExC_state->starttry)
+@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
+ #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
+ #define RExC_seen_evals	(pRExC_state->seen_evals)
+ #define RExC_utf8	(pRExC_state->utf8)
++#define RExC_orig_utf8	(pRExC_state->orig_utf8)
+ 
+ #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
+ #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
+@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+     if (exp == NULL)
+ 	FAIL("NULL regexp argument");
+ 
+-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
++    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ 
+-    RExC_precomp = exp;
+     DEBUG_r({
+ 	 if (!PL_colorset) reginitcolors();
+ 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ 		       PL_colors[4],PL_colors[5],PL_colors[0],
+-		       (int)(xend - exp), RExC_precomp, PL_colors[1]);
++		       (int)(xend - exp), exp, PL_colors[1]);
+     });
++redo_first_pass:
++    RExC_precomp = exp;
+     RExC_flags = pm->op_pmflags;
+     RExC_sawback = 0;
+ 
+@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+ 	RExC_precomp = Nullch;
+ 	return(NULL);
+     }
++    if (RExC_utf8 && !RExC_orig_utf8) {
++        /* It's possible to write a regexp in ascii that represents unicode
++        codepoints outside of the byte range, such as via \x{100}. If we
++        detect such a sequence we have to convert the entire pattern to utf8
++        and then recompile, as our sizing calculation will have been based
++        on 1 byte == 1 character, but we will need to use utf8 to encode
++        at least some part of the pattern, and therefore must convert the whole
++        thing.
++        XXX: somehow figure out how to make this less expensive...
++        -- dmq */
++        STRLEN len = xend-exp;
++        DEBUG_r(PerlIO_printf(Perl_debug_log,
++	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
++        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
++        xend = exp + len;
++        RExC_orig_utf8 = RExC_utf8;
++        SAVEFREEPV(exp);
++        goto redo_first_pass;
++    }
+     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ 
+     /* Small enough for pointer-storage convention?
+--- t/op/pat.t.orig	2006-01-07 13:53:32.000000000 +0100
++++ t/op/pat.t	2007-11-06 21:52:30.000000000 +0100
+@@ -6,7 +6,7 @@
+ 
+ $| = 1;
+ 
+-print "1..1187\n";
++print "1..1189\n";
+ 
+ BEGIN {
+     chdir 't' if -d 't';
+@@ -3395,5 +3395,14 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+        "# assigning to original string should not corrupt match vars");
+ }
+ 
+-# last test 1187
++{
++    use warnings;
++    my @w;
++    local $SIG{__WARN__}=sub{push @w,"@_"};
++    my $c=qq(\x{DF}); 
++    ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
++    ok(@w==0, "ASCII pattern that really is utf8");
++}    
++
++# last test 1189
+ 
diff --git a/lang/perl5.12/Makefile b/lang/perl5.12/Makefile
index 13c8a8237609..3c607d2a42a9 100644
--- a/lang/perl5.12/Makefile
+++ b/lang/perl5.12/Makefile
@@ -7,6 +7,7 @@
 
 PORTNAME=	perl
 PORTVERSION=	${PERL_VER}
+PORTREVISION=	1
 CATEGORIES=	lang devel perl5
 MASTER_SITES=	${MASTER_SITE_PERL_CPAN} \
 		${MASTER_SITE_LOCAL:S/$/:local/} \
diff --git a/lang/perl5.12/files/patch-utf-regcomp b/lang/perl5.12/files/patch-utf-regcomp
new file mode 100644
index 000000000000..44c9a55de49a
--- /dev/null
+++ b/lang/perl5.12/files/patch-utf-regcomp
@@ -0,0 +1,95 @@
+--- regcomp.c.orig	2006-01-08 21:59:27.000000000 +0100
++++ regcomp.c	2007-11-06 22:48:26.000000000 +0100
+@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
+     I32		extralen;
+     I32		seen_zerolen;
+     I32		seen_evals;
+-    I32		utf8;
++    I32		utf8;		/* whether the pattern is utf8 or not */
++    I32		orig_utf8;	/* whether the pattern was originally in utf8 */
++				/* XXX use this for future optimisation of case
++				 * where pattern must be upgraded to utf8. */
+ #if ADD_TO_REGEXEC
+     char 	*starttry;		/* -Dr: where regtry was called. */
+ #define RExC_starttry	(pRExC_state->starttry)
+@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
+ #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
+ #define RExC_seen_evals	(pRExC_state->seen_evals)
+ #define RExC_utf8	(pRExC_state->utf8)
++#define RExC_orig_utf8	(pRExC_state->orig_utf8)
+ 
+ #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
+ #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
+@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+     if (exp == NULL)
+ 	FAIL("NULL regexp argument");
+ 
+-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
++    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ 
+-    RExC_precomp = exp;
+     DEBUG_r({
+ 	 if (!PL_colorset) reginitcolors();
+ 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ 		       PL_colors[4],PL_colors[5],PL_colors[0],
+-		       (int)(xend - exp), RExC_precomp, PL_colors[1]);
++		       (int)(xend - exp), exp, PL_colors[1]);
+     });
++redo_first_pass:
++    RExC_precomp = exp;
+     RExC_flags = pm->op_pmflags;
+     RExC_sawback = 0;
+ 
+@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+ 	RExC_precomp = Nullch;
+ 	return(NULL);
+     }
++    if (RExC_utf8 && !RExC_orig_utf8) {
++        /* It's possible to write a regexp in ascii that represents unicode
++        codepoints outside of the byte range, such as via \x{100}. If we
++        detect such a sequence we have to convert the entire pattern to utf8
++        and then recompile, as our sizing calculation will have been based
++        on 1 byte == 1 character, but we will need to use utf8 to encode
++        at least some part of the pattern, and therefore must convert the whole
++        thing.
++        XXX: somehow figure out how to make this less expensive...
++        -- dmq */
++        STRLEN len = xend-exp;
++        DEBUG_r(PerlIO_printf(Perl_debug_log,
++	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
++        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
++        xend = exp + len;
++        RExC_orig_utf8 = RExC_utf8;
++        SAVEFREEPV(exp);
++        goto redo_first_pass;
++    }
+     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ 
+     /* Small enough for pointer-storage convention?
+--- t/op/pat.t.orig	2006-01-07 13:53:32.000000000 +0100
++++ t/op/pat.t	2007-11-06 21:52:30.000000000 +0100
+@@ -6,7 +6,7 @@
+ 
+ $| = 1;
+ 
+-print "1..1187\n";
++print "1..1189\n";
+ 
+ BEGIN {
+     chdir 't' if -d 't';
+@@ -3395,5 +3395,14 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+        "# assigning to original string should not corrupt match vars");
+ }
+ 
+-# last test 1187
++{
++    use warnings;
++    my @w;
++    local $SIG{__WARN__}=sub{push @w,"@_"};
++    my $c=qq(\x{DF}); 
++    ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
++    ok(@w==0, "ASCII pattern that really is utf8");
++}    
++
++# last test 1189
+ 
diff --git a/lang/perl5.14/Makefile b/lang/perl5.14/Makefile
index 13c8a8237609..3c607d2a42a9 100644
--- a/lang/perl5.14/Makefile
+++ b/lang/perl5.14/Makefile
@@ -7,6 +7,7 @@
 
 PORTNAME=	perl
 PORTVERSION=	${PERL_VER}
+PORTREVISION=	1
 CATEGORIES=	lang devel perl5
 MASTER_SITES=	${MASTER_SITE_PERL_CPAN} \
 		${MASTER_SITE_LOCAL:S/$/:local/} \
diff --git a/lang/perl5.14/files/patch-utf-regcomp b/lang/perl5.14/files/patch-utf-regcomp
new file mode 100644
index 000000000000..44c9a55de49a
--- /dev/null
+++ b/lang/perl5.14/files/patch-utf-regcomp
@@ -0,0 +1,95 @@
+--- regcomp.c.orig	2006-01-08 21:59:27.000000000 +0100
++++ regcomp.c	2007-11-06 22:48:26.000000000 +0100
+@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
+     I32		extralen;
+     I32		seen_zerolen;
+     I32		seen_evals;
+-    I32		utf8;
++    I32		utf8;		/* whether the pattern is utf8 or not */
++    I32		orig_utf8;	/* whether the pattern was originally in utf8 */
++				/* XXX use this for future optimisation of case
++				 * where pattern must be upgraded to utf8. */
+ #if ADD_TO_REGEXEC
+     char 	*starttry;		/* -Dr: where regtry was called. */
+ #define RExC_starttry	(pRExC_state->starttry)
+@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
+ #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
+ #define RExC_seen_evals	(pRExC_state->seen_evals)
+ #define RExC_utf8	(pRExC_state->utf8)
++#define RExC_orig_utf8	(pRExC_state->orig_utf8)
+ 
+ #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
+ #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
+@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+     if (exp == NULL)
+ 	FAIL("NULL regexp argument");
+ 
+-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
++    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ 
+-    RExC_precomp = exp;
+     DEBUG_r({
+ 	 if (!PL_colorset) reginitcolors();
+ 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ 		       PL_colors[4],PL_colors[5],PL_colors[0],
+-		       (int)(xend - exp), RExC_precomp, PL_colors[1]);
++		       (int)(xend - exp), exp, PL_colors[1]);
+     });
++redo_first_pass:
++    RExC_precomp = exp;
+     RExC_flags = pm->op_pmflags;
+     RExC_sawback = 0;
+ 
+@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+ 	RExC_precomp = Nullch;
+ 	return(NULL);
+     }
++    if (RExC_utf8 && !RExC_orig_utf8) {
++        /* It's possible to write a regexp in ascii that represents unicode
++        codepoints outside of the byte range, such as via \x{100}. If we
++        detect such a sequence we have to convert the entire pattern to utf8
++        and then recompile, as our sizing calculation will have been based
++        on 1 byte == 1 character, but we will need to use utf8 to encode
++        at least some part of the pattern, and therefore must convert the whole
++        thing.
++        XXX: somehow figure out how to make this less expensive...
++        -- dmq */
++        STRLEN len = xend-exp;
++        DEBUG_r(PerlIO_printf(Perl_debug_log,
++	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
++        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
++        xend = exp + len;
++        RExC_orig_utf8 = RExC_utf8;
++        SAVEFREEPV(exp);
++        goto redo_first_pass;
++    }
+     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ 
+     /* Small enough for pointer-storage convention?
+--- t/op/pat.t.orig	2006-01-07 13:53:32.000000000 +0100
++++ t/op/pat.t	2007-11-06 21:52:30.000000000 +0100
+@@ -6,7 +6,7 @@
+ 
+ $| = 1;
+ 
+-print "1..1187\n";
++print "1..1189\n";
+ 
+ BEGIN {
+     chdir 't' if -d 't';
+@@ -3395,5 +3395,14 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+        "# assigning to original string should not corrupt match vars");
+ }
+ 
+-# last test 1187
++{
++    use warnings;
++    my @w;
++    local $SIG{__WARN__}=sub{push @w,"@_"};
++    my $c=qq(\x{DF}); 
++    ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
++    ok(@w==0, "ASCII pattern that really is utf8");
++}    
++
++# last test 1189
+ 
diff --git a/lang/perl5.16/Makefile b/lang/perl5.16/Makefile
index 13c8a8237609..3c607d2a42a9 100644
--- a/lang/perl5.16/Makefile
+++ b/lang/perl5.16/Makefile
@@ -7,6 +7,7 @@
 
 PORTNAME=	perl
 PORTVERSION=	${PERL_VER}
+PORTREVISION=	1
 CATEGORIES=	lang devel perl5
 MASTER_SITES=	${MASTER_SITE_PERL_CPAN} \
 		${MASTER_SITE_LOCAL:S/$/:local/} \
diff --git a/lang/perl5.16/files/patch-utf-regcomp b/lang/perl5.16/files/patch-utf-regcomp
new file mode 100644
index 000000000000..44c9a55de49a
--- /dev/null
+++ b/lang/perl5.16/files/patch-utf-regcomp
@@ -0,0 +1,95 @@
+--- regcomp.c.orig	2006-01-08 21:59:27.000000000 +0100
++++ regcomp.c	2007-11-06 22:48:26.000000000 +0100
+@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
+     I32		extralen;
+     I32		seen_zerolen;
+     I32		seen_evals;
+-    I32		utf8;
++    I32		utf8;		/* whether the pattern is utf8 or not */
++    I32		orig_utf8;	/* whether the pattern was originally in utf8 */
++				/* XXX use this for future optimisation of case
++				 * where pattern must be upgraded to utf8. */
+ #if ADD_TO_REGEXEC
+     char 	*starttry;		/* -Dr: where regtry was called. */
+ #define RExC_starttry	(pRExC_state->starttry)
+@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
+ #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
+ #define RExC_seen_evals	(pRExC_state->seen_evals)
+ #define RExC_utf8	(pRExC_state->utf8)
++#define RExC_orig_utf8	(pRExC_state->orig_utf8)
+ 
+ #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
+ #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
+@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+     if (exp == NULL)
+ 	FAIL("NULL regexp argument");
+ 
+-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
++    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ 
+-    RExC_precomp = exp;
+     DEBUG_r({
+ 	 if (!PL_colorset) reginitcolors();
+ 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ 		       PL_colors[4],PL_colors[5],PL_colors[0],
+-		       (int)(xend - exp), RExC_precomp, PL_colors[1]);
++		       (int)(xend - exp), exp, PL_colors[1]);
+     });
++redo_first_pass:
++    RExC_precomp = exp;
+     RExC_flags = pm->op_pmflags;
+     RExC_sawback = 0;
+ 
+@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+ 	RExC_precomp = Nullch;
+ 	return(NULL);
+     }
++    if (RExC_utf8 && !RExC_orig_utf8) {
++        /* It's possible to write a regexp in ascii that represents unicode
++        codepoints outside of the byte range, such as via \x{100}. If we
++        detect such a sequence we have to convert the entire pattern to utf8
++        and then recompile, as our sizing calculation will have been based
++        on 1 byte == 1 character, but we will need to use utf8 to encode
++        at least some part of the pattern, and therefore must convert the whole
++        thing.
++        XXX: somehow figure out how to make this less expensive...
++        -- dmq */
++        STRLEN len = xend-exp;
++        DEBUG_r(PerlIO_printf(Perl_debug_log,
++	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
++        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
++        xend = exp + len;
++        RExC_orig_utf8 = RExC_utf8;
++        SAVEFREEPV(exp);
++        goto redo_first_pass;
++    }
+     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ 
+     /* Small enough for pointer-storage convention?
+--- t/op/pat.t.orig	2006-01-07 13:53:32.000000000 +0100
++++ t/op/pat.t	2007-11-06 21:52:30.000000000 +0100
+@@ -6,7 +6,7 @@
+ 
+ $| = 1;
+ 
+-print "1..1187\n";
++print "1..1189\n";
+ 
+ BEGIN {
+     chdir 't' if -d 't';
+@@ -3395,5 +3395,14 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+        "# assigning to original string should not corrupt match vars");
+ }
+ 
+-# last test 1187
++{
++    use warnings;
++    my @w;
++    local $SIG{__WARN__}=sub{push @w,"@_"};
++    my $c=qq(\x{DF}); 
++    ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
++    ok(@w==0, "ASCII pattern that really is utf8");
++}    
++
++# last test 1189
+ 
diff --git a/lang/perl5.8/Makefile b/lang/perl5.8/Makefile
index 13c8a8237609..3c607d2a42a9 100644
--- a/lang/perl5.8/Makefile
+++ b/lang/perl5.8/Makefile
@@ -7,6 +7,7 @@
 
 PORTNAME=	perl
 PORTVERSION=	${PERL_VER}
+PORTREVISION=	1
 CATEGORIES=	lang devel perl5
 MASTER_SITES=	${MASTER_SITE_PERL_CPAN} \
 		${MASTER_SITE_LOCAL:S/$/:local/} \
diff --git a/lang/perl5.8/files/patch-utf-regcomp b/lang/perl5.8/files/patch-utf-regcomp
new file mode 100644
index 000000000000..44c9a55de49a
--- /dev/null
+++ b/lang/perl5.8/files/patch-utf-regcomp
@@ -0,0 +1,95 @@
+--- regcomp.c.orig	2006-01-08 21:59:27.000000000 +0100
++++ regcomp.c	2007-11-06 22:48:26.000000000 +0100
+@@ -135,7 +135,10 @@ typedef struct RExC_state_t {
+     I32		extralen;
+     I32		seen_zerolen;
+     I32		seen_evals;
+-    I32		utf8;
++    I32		utf8;		/* whether the pattern is utf8 or not */
++    I32		orig_utf8;	/* whether the pattern was originally in utf8 */
++				/* XXX use this for future optimisation of case
++				 * where pattern must be upgraded to utf8. */
+ #if ADD_TO_REGEXEC
+     char 	*starttry;		/* -Dr: where regtry was called. */
+ #define RExC_starttry	(pRExC_state->starttry)
+@@ -161,6 +164,7 @@ typedef struct RExC_state_t {
+ #define RExC_seen_zerolen	(pRExC_state->seen_zerolen)
+ #define RExC_seen_evals	(pRExC_state->seen_evals)
+ #define RExC_utf8	(pRExC_state->utf8)
++#define RExC_orig_utf8	(pRExC_state->orig_utf8)
+ 
+ #define	ISMULT1(c)	((c) == '*' || (c) == '+' || (c) == '?')
+ #define	ISMULT2(s)	((*s) == '*' || (*s) == '+' || (*s) == '?' || \
+@@ -1749,15 +1753,16 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+     if (exp == NULL)
+ 	FAIL("NULL regexp argument");
+ 
+-    RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
++    RExC_utf8 = RExC_orig_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
+ 
+-    RExC_precomp = exp;
+     DEBUG_r({
+ 	 if (!PL_colorset) reginitcolors();
+ 	 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s `%s%*s%s'\n",
+ 		       PL_colors[4],PL_colors[5],PL_colors[0],
+-		       (int)(xend - exp), RExC_precomp, PL_colors[1]);
++		       (int)(xend - exp), exp, PL_colors[1]);
+     });
++redo_first_pass:
++    RExC_precomp = exp;
+     RExC_flags = pm->op_pmflags;
+     RExC_sawback = 0;
+ 
+@@ -1783,6 +1788,25 @@ Perl_pregcomp(pTHX_ char *exp, char *xen
+ 	RExC_precomp = Nullch;
+ 	return(NULL);
+     }
++    if (RExC_utf8 && !RExC_orig_utf8) {
++        /* It's possible to write a regexp in ascii that represents unicode
++        codepoints outside of the byte range, such as via \x{100}. If we
++        detect such a sequence we have to convert the entire pattern to utf8
++        and then recompile, as our sizing calculation will have been based
++        on 1 byte == 1 character, but we will need to use utf8 to encode
++        at least some part of the pattern, and therefore must convert the whole
++        thing.
++        XXX: somehow figure out how to make this less expensive...
++        -- dmq */
++        STRLEN len = xend-exp;
++        DEBUG_r(PerlIO_printf(Perl_debug_log,
++	    "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
++        exp = (char*)Perl_bytes_to_utf8(aTHX_ (U8*)exp, &len);
++        xend = exp + len;
++        RExC_orig_utf8 = RExC_utf8;
++        SAVEFREEPV(exp);
++        goto redo_first_pass;
++    }
+     DEBUG_r(PerlIO_printf(Perl_debug_log, "size %"IVdf" ", (IV)RExC_size));
+ 
+     /* Small enough for pointer-storage convention?
+--- t/op/pat.t.orig	2006-01-07 13:53:32.000000000 +0100
++++ t/op/pat.t	2007-11-06 21:52:30.000000000 +0100
+@@ -6,7 +6,7 @@
+ 
+ $| = 1;
+ 
+-print "1..1187\n";
++print "1..1189\n";
+ 
+ BEGIN {
+     chdir 't' if -d 't';
+@@ -3395,5 +3395,14 @@ ok(("foba  ba$s" =~ qr/(foo|BaSS|bar)/i)
+        "# assigning to original string should not corrupt match vars");
+ }
+ 
+-# last test 1187
++{
++    use warnings;
++    my @w;
++    local $SIG{__WARN__}=sub{push @w,"@_"};
++    my $c=qq(\x{DF}); 
++    ok($c=~/${c}|\x{100}/, "ASCII pattern that really is utf8");
++    ok(@w==0, "ASCII pattern that really is utf8");
++}    
++
++# last test 1189
+
author	Anton Berezin <tobez@FreeBSD.org>	2007-11-06 22:08:03 +0000
committer	Anton Berezin <tobez@FreeBSD.org>	2007-11-06 22:08:03 +0000
commit	bbc34ddf1100af44360c6c35dce71535bafa5a89 (patch)
tree	4b0adecbac01f652e93707e150100970d586a53c /lang
parent	- irssi-devel should be using @dirrmtry in a few cases. this is (diff)