Index: CHANGES =================================================================== --- CHANGES (revision 598272) +++ CHANGES (working copy) @@ -1,6 +1,10 @@ -*- coding: utf-8 -*- Changes with Apache 2.2.7 + *) mod_substitute: Added a new experimental output filter, which + performs inline response content pattern matching (including + regex) and substitution. [Jim Jagielski, Ruediger Pluem] + *) rotatelogs: Change command-line parsing to report more types of errors. Allow local timestamps to be used when rotating based on file size. [Jeff Trawick] Index: modules/filters/config.m4 =================================================================== --- modules/filters/config.m4 (revision 598264) +++ modules/filters/config.m4 (working copy) @@ -7,6 +7,7 @@ APACHE_MODULE(ext_filter, external filter module, , , most) APACHE_MODULE(include, Server Side Includes, , , yes) APACHE_MODULE(filter, Smart Filtering, , , yes) +APACHE_MODULE(substitute, response content rewrite-like filtering, , , most) if test "$ac_cv_ebcdic" = "yes"; then # mod_charset_lite can be very useful on an ebcdic system, Index: modules/filters/NWGNUmakefile =================================================================== --- modules/filters/NWGNUmakefile (revision 598264) +++ modules/filters/NWGNUmakefile (working copy) @@ -155,6 +155,7 @@ $(OBJDIR)/extfiltr.nlm \ $(OBJDIR)/charsetl.nlm \ $(OBJDIR)/mod_filter.nlm \ + $(OBJDIR)/substitute.nlm \ $(EOLIST) # If the zlib libraries source exists then build the mod_deflate module --- /dev/null 2007-11-26 10:25:07.000000000 -0500 +++ docs/manual/mod/mod_substitute.xml 2007-11-26 10:37:56.000000000 -0500 @@ -0,0 +1,90 @@ + + + + + + + + + +mod_substitute +Perform search and replace operations on response bodies +Extension +mod_substitute.c +substitute_module + + +

mod_substitute provides a mechanism to perform + both regular expression and fixed string substitutions on + response bodies.

+

This is an experimental module and should + be used with care.

+
+ + +Substitute +Pattern to filter the response content +Substitute s/pattern/substitution/[inf] +directory +.htaccess +FileInfo + + +

The Substitute directive specifies a + search and replace pattern to apply to the response body.

+ +

The meaning of the pattern can be modified by using any + combination of these flags:

+ +
+
i
+
Perform a case-insensitive match.
+
n
+
By default the pattern is treated as a regular expression. + Using the n flag forces the pattern to be treated + as a fixed string.
+
f
+
The f flag causes mod_substitute to flatten the + result of a substitution allowing for later substitutions to + take place on the boundary of this one.
+
+ + Example + <Location /> + + AddOutputFilterByType SUBSTITUTE text/html
+ Substitute s/foo/bar/ni
+
+ </Location> +
+ +

If either the pattern or the substitution contains a slash + character, then an alternative delimiter should be used:

+ + Example of using an alternate delimiter + <Location /> + + AddOutputFilterByType SUBSTITUTE text/html
+ Substitute "s|<BR */?>|<br />|i" +
+ </Location> +
+
+
+ +
--- /dev/null 2007-11-26 10:25:07.000000000 -0500
+++ docs/manual/mod/mod_substitute.xml.meta 2007-11-26 10:32:46.000000000 -0500
@@ -0,0 +1,11 @@
+
+
+
+ mod_substitute
+ /mod/
+ ..
+
+
+ en
+
+
--- /dev/null 2007-12-08 13:52:08.000000000 -0500
+++ modules/filters/mod_substitute.c 2007-12-08 13:54:18.000000000 -0500
@@ -0,0 +1,618 @@
+/* Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * mod_substitute.c: Perform content rewriting on the fly
+ */
+
+#include "httpd.h"
+#include "http_config.h"
+#include "http_core.h"
+#include "apr_general.h"
+#include "apr_strings.h"
+#include "apr_strmatch.h"
+#include "apr_lib.h"
+#include "util_filter.h"
+#include "apr_buckets.h"
+#include "http_request.h"
+#define APR_WANT_STRFUNC
+#include "apr_want.h"
+
+static const char substitute_filter_name[] = "SUBSTITUTE";
+
+module AP_MODULE_DECLARE_DATA substitute_module;
+
+typedef struct subst_pattern_t {
+    const apr_strmatch_pattern *pattern;
+    const ap_regex_t *regexp;
+    const char *replacement;
+    apr_size_t replen;
+    apr_size_t patlen;
+    int flatten;
+} subst_pattern_t;
+
+typedef struct {
+    apr_array_header_t *patterns;
+} subst_dir_conf;
+
+typedef struct {
+    apr_bucket_brigade *linebb;
+    apr_bucket_brigade *linesbb;
+    apr_bucket_brigade *passbb;
+    apr_bucket_brigade *pattbb;
+    apr_pool_t *tpool;
+} substitute_module_ctx;
+
+/* Create an empty per-directory config with room for 10 patterns. */
+static void *create_substitute_dcfg(apr_pool_t *p, char *d)
+{
+    subst_dir_conf *dcfg =
+    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
+
+    dcfg->patterns = apr_array_make(p, 10, sizeof(subst_pattern_t));
+    return dcfg;
+}
+
+/*
+ * Merge two per-directory configs into a new one whose pattern list is
+ * the overriding config's patterns followed by the base config's.
+ */
+static void *merge_substitute_dcfg(apr_pool_t *p, void *basev, void *overv)
+{
+    subst_dir_conf *a =
+    (subst_dir_conf *) apr_pcalloc(p, sizeof(subst_dir_conf));
+    subst_dir_conf *base = (subst_dir_conf *) basev;
+    subst_dir_conf *over = (subst_dir_conf *) overv;
+
+    a->patterns = apr_array_append(p, over->patterns,
+                                   base->patterns);
+    return a;
+}
+
+#define AP_MAX_BUCKETS 1000
+
+/*
+ * Append blen bytes of buff and then repl to the string s1, starting a
+ * new string when s1 is NULL.  s2 is scratch space for the copied chunk.
+ */
+#define SEDSCAT(s1, s2, pool, buff, blen, repl) do { \
+    if (!s1) {                                       \
+        s1 = apr_pstrmemdup(pool, buff, blen);       \
+    }                                                \
+    else {                                           \
+        s2 = apr_pstrmemdup(pool, buff, blen);       \
+        s1 = apr_pstrcat(pool, s1, s2, NULL);        \
+    }                                                \
+    s1 = apr_pstrcat(pool, s1, repl, NULL);          \
+} while (0)
+
+/*
+ * Delete patlen bytes at offset from bucket b by splitting around them;
+ * afterwards b is the bucket that follows the deleted bytes.
+ */
+#define SEDRMPATBCKT(b, offset, tmp_b, patlen) do {  \
+    apr_bucket_split(b, offset);                     \
+    tmp_b = APR_BUCKET_NEXT(b);                      \
+    apr_bucket_split(tmp_b, patlen);                 \
+    b = APR_BUCKET_NEXT(tmp_b);                      \
+    apr_bucket_delete(tmp_b);                        \
+} while (0)
+
+/*
+ * Apply every configured pattern, in order, to the data bucket inb.
+ * inb is appended to mybb, which holds the rewritten buckets on return.
+ * Replacement text is allocated from tmp_pool and referenced by transient
+ * buckets, so mybb must be consumed before the caller clears tmp_pool.
+ */
+static void do_pattmatch(ap_filter_t *f, apr_bucket *inb,
+                         apr_bucket_brigade *mybb,
+                         apr_pool_t *tmp_pool)
+{
+    int i;
+    ap_regmatch_t regm[AP_MAX_REG_MATCH];
+    apr_size_t bytes;
+    apr_size_t len;
+    apr_size_t fbytes;
+    const char *buff;
+    const char *repl;
+    char *scratch;
+    char *p;
+    char *s1;
+    char *s2;
+    apr_bucket *b;
+    apr_bucket *tmp_b;
+    apr_pool_t *tpool;
+
+    subst_dir_conf *cfg =
+    (subst_dir_conf *) ap_get_module_config(f->r->per_dir_config,
+                                             &substitute_module);
+    subst_pattern_t *script;
+
+    APR_BRIGADE_INSERT_TAIL(mybb, inb);
+
+    script = (subst_pattern_t *) cfg->patterns->elts;
+    apr_pool_create(&tpool, tmp_pool);
+    scratch = NULL;
+    fbytes = 0;
+    for (i = 0; i < cfg->patterns->nelts; i++) {
+        for (b = APR_BRIGADE_FIRST(mybb);
+             b != APR_BRIGADE_SENTINEL(mybb);
+             b = APR_BUCKET_NEXT(b)) {
+            if (APR_BUCKET_IS_METADATA(b)) {
+                /*
+                 * we should NEVER see this, because we should never
+                 * be passed any, but "handle" it just in case.
+                 */
+                continue;
+            }
+            if (apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ)
+                    == APR_SUCCESS) {
+                s1 = NULL;
+                if (script->pattern) {
+                    while ((repl = apr_strmatch(script->pattern, buff, bytes)))
+                    {
+                        /* get offset into buff for pattern */
+                        len = (apr_size_t) (repl - buff);
+                        if (script->flatten) {
+                            /*
+                             * We are flattening the buckets here, meaning
+                             * that we don't do the fast bucket splits.
+                             * Instead we copy over what the buckets would
+                             * contain and use them. This is slow, since we
+                             * are constantly allocating space and copying
+                             * strings.
+                             */
+                            SEDSCAT(s1, s2, tmp_pool, buff, len,
+                                    script->replacement);
+                        }
+                        else {
+                            /*
+                             * We now split off the stuff before the regex
+                             * as its own bucket, then isolate the pattern
+                             * and delete it.
+                             */
+                            SEDRMPATBCKT(b, len, tmp_b, script->patlen);
+                            /*
+                             * Finally, we create a bucket that contains the
+                             * replacement...
+                             */
+                            tmp_b = apr_bucket_transient_create(script->replacement,
+                                      script->replen,
+                                      f->r->connection->bucket_alloc);
+                            /* ... and insert it */
+                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
+                        }
+                        /* now we need to adjust buff for all these changes */
+                        len += script->patlen;
+                        bytes -= len;
+                        buff += len;
+                    }
+                    if (script->flatten && s1) {
+                        /*
+                         * we've finished looking at the bucket, so remove the
+                         * old one and add in our new one
+                         */
+                        s2 = apr_pstrmemdup(tmp_pool, buff, bytes);
+                        s1 = apr_pstrcat(tmp_pool, s1, s2, NULL);
+                        tmp_b = apr_bucket_transient_create(s1, strlen(s1),
+                                            f->r->connection->bucket_alloc);
+                        APR_BUCKET_INSERT_BEFORE(b, tmp_b);
+                        /* stay on the new bucket; the for loop steps past it */
+                        apr_bucket_delete(b);
+                        b = tmp_b;
+                    }
+
+                }
+                else if (script->regexp) {
+                    /*
+                     * we need a null terminated string here :(. To hopefully
+                     * save time and memory, we don't alloc for each run
+                     * through, but only if we need to have a larger chunk
+                     * to save the string to. So we keep track of how much
+                     * we've allocated and only re-alloc when we need it.
+                     * NOTE: this screams for a macro.
+                     */
+                    if (!scratch || (bytes + 1 > fbytes)) {
+                        fbytes = bytes + 1;
+                        scratch = apr_palloc(tpool, fbytes);
+                    }
+                    /* reset pointer to the scratch space */
+                    p = scratch;
+                    memcpy(p, buff, bytes);
+                    p[bytes] = '\0';
+                    while (!ap_regexec(script->regexp, p,
+                                       AP_MAX_REG_MATCH, regm, 0)) {
+                        /* first, grab the replacement string */
+                        repl = ap_pregsub(tmp_pool, script->replacement, p,
+                                          AP_MAX_REG_MATCH, regm);
+                        if (script->flatten) {
+                            SEDSCAT(s1, s2, tmp_pool, p, regm[0].rm_so, repl);
+                        }
+                        else {
+                            len = (apr_size_t) (regm[0].rm_eo - regm[0].rm_so);
+                            SEDRMPATBCKT(b, regm[0].rm_so, tmp_b, len);
+                            tmp_b = apr_bucket_transient_create(repl,
+                                                                strlen(repl),
+                                             f->r->connection->bucket_alloc);
+                            APR_BUCKET_INSERT_BEFORE(b, tmp_b);
+                        }
+                        /*
+                         * reset to past what we just did. buff now maps to b
+                         * again
+                         */
+                        p += regm[0].rm_eo;
+                    }
+                    if (script->flatten && s1) {
+                        s1 = apr_pstrcat(tmp_pool, s1, p, NULL);
+                        tmp_b = apr_bucket_transient_create(s1, strlen(s1),
+                                            f->r->connection->bucket_alloc);
+                        APR_BUCKET_INSERT_BEFORE(b, tmp_b);
+                        /* stay on the new bucket; the for loop steps past it */
+                        apr_bucket_delete(b);
+                        b = tmp_b;
+                    }
+
+                }
+                else {
+                    /* huh? */
+                    continue;
+                }
+            }
+        }
+        script++;
+    }
+
+    apr_pool_destroy(tpool);
+
+    return;
+}
+
+/*
+ * Output filter: split the response into newline-terminated lines, run
+ * do_pattmatch() on each complete line and pass the result down the
+ * filter chain.  A trailing partial line is held in ctx->linebb until
+ * more data or EOS arrives.
+ */
+static apr_status_t substitute_filter(ap_filter_t *f, apr_bucket_brigade *bb)
+{
+    apr_size_t bytes;
+    apr_size_t len;
+    apr_size_t fbytes;
+    const char *buff;
+    const char *nl = NULL;
+    char *bflat;
+    apr_bucket *b;
+    apr_bucket *tmp_b;
+    apr_bucket_brigade *tmp_bb = NULL;
+    apr_status_t rv;
+
+    substitute_module_ctx *ctx = f->ctx;
+
+    /*
+     * First time around? Create the saved bb that we used for each pass
+     * through. Note that we can also get here when we explicitly clear ctx,
+     * for error handling
+     */
+    if (!ctx) {
+        f->ctx = ctx = apr_pcalloc(f->r->pool, sizeof(*ctx));
+        /*
+         * Create all the temporary brigades we need and reuse them to avoid
+         * creating them over and over again from r->pool which would cost a
+         * lot of memory in some cases.
+         */
+        ctx->linebb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
+        ctx->linesbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
+        ctx->pattbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
+        /*
+         * Everything to be passed to the next filter goes in
+         * here, our pass brigade.
+         */
+        ctx->passbb = apr_brigade_create(f->r->pool, f->c->bucket_alloc);
+        /* Create our temporary pool only once */
+        apr_pool_create(&(ctx->tpool), f->r->pool);
+        apr_table_unset(f->r->headers_out, "Content-Length");
+    }
+
+    /*
+     * Shortcircuit processing
+     */
+    if (APR_BRIGADE_EMPTY(bb))
+        return APR_SUCCESS;
+
+    /*
+     * Here's the concept:
+     *  Read in the data and look for newlines. Once we
+     *  find a full "line", add it to our working brigade.
+     *  If we've finished reading the brigade and we have
+     *  any left over data (not a "full" line), store that
+     *  for the next pass.
+     *
+     * Note: anything stored in ctx->linebb for sure does not have
+     * a newline char, so we don't concat that bb with the
+     * new bb, since we would be spending time searching for the newline
+     * in data we know it doesn't exist. So instead, we simply scan
+     * our current bb and, if we see a newline, prepend ctx->linebb
+     * to the front of it. This makes the code much less straight-
+     * forward (otherwise we could APR_BRIGADE_CONCAT(ctx->linebb, bb)
+     * and just scan for newlines and not bother with needing to know
+     * when ctx->linebb needs to be reset) but also faster. We'll take
+     * the speed.
+     *
+     * Note: apr_brigade_split_line would be nice here, but we
+     * really can't use it since we need more control and we want
+     * to re-use already read bucket data.
+     *
+     * See mod_include if still confused :)
+     */
+
+    while ((b = APR_BRIGADE_FIRST(bb)) && (b != APR_BRIGADE_SENTINEL(bb))) {
+        if (APR_BUCKET_IS_EOS(b)) {
+            /*
+             * if we see the EOS, then we need to pass along everything we
+             * have. But if the ctx->linebb isn't empty, then we need to add
+             * that to the end of what we'll be passing.
+             */
+            if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
+                rv = apr_brigade_pflatten(ctx->linebb, &bflat,
+                                          &fbytes, ctx->tpool);
+                tmp_b = apr_bucket_transient_create(bflat, fbytes,
+                                            f->r->connection->bucket_alloc);
+                do_pattmatch(f, tmp_b, ctx->pattbb, ctx->tpool);
+                APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
+            }
+            apr_brigade_cleanup(ctx->linebb);
+            APR_BUCKET_REMOVE(b);
+            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
+        }
+        /*
+         * No need to handle FLUSH buckets separately as we call
+         * ap_pass_brigade anyway at the end of the loop.
+         */
+        else if (APR_BUCKET_IS_METADATA(b)) {
+            APR_BUCKET_REMOVE(b);
+            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
+        }
+        else {
+            /*
+             * We have actual "data" so read in as much as we can and start
+             * scanning and splitting from our read buffer
+             */
+            rv = apr_bucket_read(b, &buff, &bytes, APR_BLOCK_READ);
+            if (rv != APR_SUCCESS || bytes == 0) {
+                /* unreadable or empty: free it, REMOVE alone leaks */
+                apr_bucket_delete(b);
+            }
+            else {
+                int num = 0;
+                while (bytes > 0) {
+                    nl = memchr(buff, APR_ASCII_LF, bytes);
+                    if (nl) {
+                        len = (apr_size_t) (nl - buff) + 1;
+                        /* split *after* the newline */
+                        apr_bucket_split(b, len);
+                        /*
+                         * We've likely read more data, so bypass rereading
+                         * bucket data and continue scanning through this
+                         * buffer
+                         */
+                        bytes -= len;
+                        buff += len;
+                        /*
+                         * we need b to be updated for future potential
+                         * splitting
+                         */
+                        tmp_b = APR_BUCKET_NEXT(b);
+                        APR_BUCKET_REMOVE(b);
+                        /*
+                         * Hey, we found a newline! Don't forget the old
+                         * stuff that needs to be added to the front. So we
+                         * add the split bucket to the end, flatten the whole
+                         * bb, morph the whole shebang into a bucket which is
+                         * then added to the tail of the newline bb.
+                         */
+                        if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
+                            APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
+                            rv = apr_brigade_pflatten(ctx->linebb, &bflat,
+                                                      &fbytes, ctx->tpool);
+                            b = apr_bucket_transient_create(bflat, fbytes,
+                                            f->r->connection->bucket_alloc);
+                            apr_brigade_cleanup(ctx->linebb);
+                        }
+                        do_pattmatch(f, b, ctx->pattbb, ctx->tpool);
+                        /*
+                         * Count how many buckets we have in ctx->passbb
+                         * so far. Yes, this is correct we count ctx->passbb
+                         * and not ctx->pattbb as we do not reset num on every
+                         * iteration.
+                         */
+                        for (b = APR_BRIGADE_FIRST(ctx->pattbb);
+                             b != APR_BRIGADE_SENTINEL(ctx->pattbb);
+                             b = APR_BUCKET_NEXT(b)) {
+                            num++;
+                        }
+                        APR_BRIGADE_CONCAT(ctx->passbb, ctx->pattbb);
+                        /*
+                         * If the number of buckets in ctx->passbb reaches an
+                         * "insane" level, we consume much memory for all the
+                         * buckets as such. So let's flush them down the chain
+                         * in this case and thus clear ctx->passbb. This frees
+                         * the buckets memory for further processing.
+                         * Usually this condition should not become true, but
+                         * it is a safety measure for edge cases.
+                         */
+                        if (num > AP_MAX_BUCKETS) {
+                            b = apr_bucket_flush_create(
+                                            f->r->connection->bucket_alloc);
+                            APR_BRIGADE_INSERT_TAIL(ctx->passbb, b);
+                            rv = ap_pass_brigade(f->next, ctx->passbb);
+                            apr_brigade_cleanup(ctx->passbb);
+                            num = 0;
+                            apr_pool_clear(ctx->tpool);
+                            if (rv != APR_SUCCESS)
+                                return rv;
+                        }
+                        b = tmp_b;
+                    }
+                    else {
+                        /*
+                         * no newline in whatever is left of this buffer so
+                         * tuck data away and get next bucket
+                         */
+                        APR_BUCKET_REMOVE(b);
+                        APR_BRIGADE_INSERT_TAIL(ctx->linebb, b);
+                        bytes = 0;
+                    }
+                }
+            }
+        }
+        if (!APR_BRIGADE_EMPTY(ctx->passbb)) {
+            rv = ap_pass_brigade(f->next, ctx->passbb);
+            apr_brigade_cleanup(ctx->passbb);
+            if (rv != APR_SUCCESS) {
+                apr_pool_clear(ctx->tpool);
+                return rv;
+            }
+        }
+        apr_pool_clear(ctx->tpool);
+    }
+
+    /* Anything left we want to save/setaside for the next go-around */
+    if (!APR_BRIGADE_EMPTY(ctx->linebb)) {
+        /*
+         * Provide ap_save_brigade with an existing empty brigade
+         * (ctx->linesbb) to avoid creating a new one.
+         */
+        ap_save_brigade(f, &(ctx->linesbb), &(ctx->linebb), f->r->pool);
+        tmp_bb = ctx->linebb;
+        ctx->linebb = ctx->linesbb;
+        ctx->linesbb = tmp_bb;
+    }
+
+    return APR_SUCCESS;
+}
+
+/*
+ * Handler for the Substitute directive.  Parses "s/from/to/[inf]", where
+ * "/" may be any delimiter character, and appends the compiled pattern
+ * to the per-directory list.  Returns NULL on success or an error message
+ * for the config parser.
+ */
+static const char *set_pattern(cmd_parms *cmd, void *cfg, const char *line)
+{
+    char *from = NULL;
+    char *to = NULL;
+    char *flags = NULL;
+    char *ourline;
+    char delim;
+    subst_pattern_t *nscript;
+    int is_pattern = 0;
+    int ignore_case = 0;
+    int flatten = 0;
+    ap_regex_t *r = NULL;
+
+    if (apr_tolower(*line) != 's') {
+        return "Bad Substitute format, must be an s/// pattern";
+    }
+    ourline = apr_pstrdup(cmd->pool, line);
+    delim = *++ourline;
+    if (delim)
+        from = ++ourline;
+    if (from) {
+        for (; *ourline && *ourline != delim; ++ourline);
+        if (*ourline) {
+            *ourline = '\0';
+            to = ++ourline;
+        }
+    }
+    if (to) {
+        for (; *ourline && *ourline != delim; ++ourline);
+        if (*ourline) {
+            *ourline = '\0';
+            flags = ++ourline;
+        }
+    }
+
+    if (!delim || !from || !*from || !to) {
+        return "Bad Substitute format, must be a complete s/// pattern";
+    }
+
+    /* flags is NULL when the closing delimiter was left off */
+    while (flags && *flags) {
+        delim = apr_tolower(*flags);    /* re-use */
+        if (delim == 'i')
+            ignore_case = 1;
+        else if (delim == 'n')
+            is_pattern = 1;
+        else if (delim == 'f')
+            flatten = 1;
+        else
+            return "Bad Substitute flag, only s///[inf] are supported";
+        flags++;
+    }
+
+    /* first see if we can compile the regex */
+    if (!is_pattern) {
+        r = ap_pregcomp(cmd->pool, from, AP_REG_EXTENDED |
+                        (ignore_case ? AP_REG_ICASE : 0));
+        if (!r)
+            return "Substitute could not compile regex";
+    }
+    nscript = apr_array_push(((subst_dir_conf *) cfg)->patterns);
+    /* init the new entries */
+    nscript->pattern = NULL;
+    nscript->regexp = NULL;
+    nscript->replacement = NULL;
+    nscript->patlen = 0;
+
+    if (is_pattern) {
+        nscript->patlen = strlen(from);
+        nscript->pattern = apr_strmatch_precompile(cmd->pool, from,
+                                                   !ignore_case);
+    }
+    else {
+        nscript->regexp = r;
+    }
+
+    nscript->replacement = to;
+    nscript->replen = strlen(to);
+    nscript->flatten = flatten;
+
+    return NULL;
+}
+
+#define PROTO_FLAGS AP_FILTER_PROTO_CHANGE|AP_FILTER_PROTO_CHANGE_LENGTH
+/* Register the SUBSTITUTE output filter as a resource-level filter. */
+static void register_hooks(apr_pool_t *pool)
+{
+    ap_register_output_filter(substitute_filter_name, substitute_filter,
+                              NULL, AP_FTYPE_RESOURCE);
+}
+
+static const command_rec substitute_cmds[] = {
+    AP_INIT_TAKE1("Substitute", set_pattern, NULL, OR_FILEINFO,
+                  "Pattern to filter the response content (s/foo/bar/[inf])"),
+    {NULL}
+};
+
+module AP_MODULE_DECLARE_DATA substitute_module = {
+    STANDARD20_MODULE_STUFF,
+    create_substitute_dcfg,     /* dir config creator */
+    merge_substitute_dcfg,      /* dir merger --- default is to override */
+    NULL,                       /* server config */
+    NULL,                       /* merge server config */
+    substitute_cmds,            /* command table */
+    register_hooks              /* register hooks */
+};