diff --git a/NEWS.md b/NEWS.md index 256c7450a..f2d994bf1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -26,6 +26,8 @@ 3. Vignettes are now built using `litedown` instead of `knitr`, [#6394](https://github.com/Rdatatable/data.table/issues/6394). Thanks @jangorecki for the suggestion and @ben-schwen and @aitap for the implementation. +4. Removed use of non-API macros `ATTRIB`, `SET_ATTRIB`, [#6180](https://github.com/Rdatatable/data.table/issues/6180). Thanks @aitap for the continued assiduous work here. + ### BUG FIXES 1. `fread()` with `skip=0` and `(header=TRUE|FALSE)` no longer skips the first row when it has fewer fields than subsequent rows, [#7463](https://github.com/Rdatatable/data.table/issues/7463). Thanks @emayerhofer for the report and @ben-schwen for the fix. diff --git a/src/assign.c b/src/assign.c index 849cb08f2..4cc8bccb1 100644 --- a/src/assign.c +++ b/src/assign.c @@ -256,6 +256,103 @@ SEXP selfrefokwrapper(SEXP x, SEXP verbose) { return ScalarInteger(_selfrefok(x,FALSE,LOGICAL(verbose)[0])); } +struct attrib_name_ctx { + hashtab *indexNames; // stores a 1 for every CHARSXP index name in use, 0 for removed + R_xlen_t indexNamesLen; // how much memory to allocate for the hash? + SEXP index; // attr(DT, "index") + SEXP assignedNames; // STRSXP vector of variable names just assigned + bool verbose; +}; + +// Mark each CHARSXP attribute name with a 1 inside the hash, or count them to find out the allocation size. +static SEXP getOneAttribName(SEXP key, SEXP val, void *ctx_) { + (void)val; + struct attrib_name_ctx *ctx = ctx_; + if (ctx->indexNames) + hash_set(ctx->indexNames, PRINTNAME(key), 1); + else + ctx->indexNamesLen++; + return NULL; +} + +// For a given index, find out if it sorts a column that has just been assigned. If so, shorten the index (if an equivalent one doesn't already exist) or remove it altogether. +static SEXP fixIndexAttrib(SEXP tag, SEXP value, void *ctx_) { + const struct attrib_name_ctx *ctx = ctx_; + + hashtab *indexNames = ctx->indexNames; + SEXP index = ctx->index, assignedNames = ctx->assignedNames; + R_xlen_t indexLength = xlength(value); + bool verbose = ctx->verbose; + + const char *tc1, *c1; + tc1 = c1 = CHAR(PRINTNAME(tag)); // the index name; e.g. "__col1__col2" + + if (*tc1!='_' || *(tc1+1)!='_') { + // fix for #1396 + if (verbose) { + Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); + } + setAttrib(index, tag, R_NilValue); + return NULL; + } + + tc1 += 2; // tc1 always marks the start of a key column + if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov + + void *vmax = vmaxget(); + // check the position of the first appearance of an assigned column in the index. + // the new index will be truncated to this position. + size_t newKeyLength = strlen(c1); + char *s4 = R_alloc(newKeyLength + 3, 1); + memcpy(s4, c1, newKeyLength); + memcpy(s4 + newKeyLength, "__", 3); + + for(int i = 0; i < xlength(assignedNames); i++){ + const char *tc2 = CHAR(STRING_ELT(assignedNames, i)); + void *vmax2 = vmaxget(); + size_t tc2_len = strlen(tc2); + char *s5 = R_alloc(tc2_len + 5, 1); //4 * '_' + \0 + memcpy(s5, "__", 2); + memcpy(s5 + 2, tc2, tc2_len); + memcpy(s5 + 2 + tc2_len, "__", 3); + tc2 = strstr(s4, s5); + if(tc2 && (tc2 - s4 < newKeyLength)){ // new column is part of key; match is before last match + newKeyLength = tc2 - s4; + } + vmaxset(vmax2); + } + + s4[newKeyLength] = '\0'; // truncate the new key to the new length + if(newKeyLength == 0){ // no valid key column remains. Drop the key + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) { + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + } else if(newKeyLength < strlen(c1)) { + SEXP s4Str = PROTECT(mkChar(s4)); + if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) + !hash_lookup(indexNames, s4Str, 0)) { // index with shortened name not present yet + setAttrib(index, installChar(s4Str), value); + hash_set(indexNames, PRINTNAME(tag), 0); + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, s4Str, 1); + if (verbose) + Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4+2); + } else { // indexLength > 0 || shortened name present already + // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) + // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. + setAttrib(index, tag, R_NilValue); + hash_set(indexNames, PRINTNAME(tag), 0); + if (verbose) + Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); + } + UNPROTECT(1); // s4Str + } //else: index is not affected by assign: nothing to be done + vmaxset(vmax); + return NULL; +} + int *_Last_updated = NULL; SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) @@ -264,12 +361,12 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) // newcolnames : add these columns (if any) // cols : column names or numbers corresponding to the values to set // rows : row numbers to assign - R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum, indexLength; - SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames, indexNames; + R_len_t numToDo, targetlen, vlen, oldncol, oldtncol, coln, protecti=0, newcolnum; + SEXP targetcol, nullint, s, colnam, tmp, key, index, a, assignedNames; bool verbose=GetVerbose(); int ndelete=0; // how many columns are being deleted const char *c1, *tc1, *tc2; - int *buf, indexNo; + int *buf; if (isNull(dt)) error(_("assign has been passed a NULL dt")); if (TYPEOF(dt) != VECSXP) error(_("dt passed to assign isn't type VECSXP")); if (islocked(dt)) @@ -549,93 +646,17 @@ SEXP assign(SEXP dt, SEXP rows, SEXP cols, SEXP newcolnames, SEXP values) } index = getAttrib(dt, install("index")); if (index != R_NilValue) { - s = ATTRIB(index); - indexNo = 0; - // get a vector with all index names - PROTECT(indexNames = allocVector(STRSXP, xlength(s))); protecti++; - while(s != R_NilValue){ - SET_STRING_ELT(indexNames, indexNo, PRINTNAME(TAG(s))); - indexNo++; - s = CDR(s); - } - s = ATTRIB(index); // reset to first element - indexNo = 0; - while(s != R_NilValue) { - a = TAG(s); - indexLength = xlength(CAR(s)); - tc1 = c1 = CHAR(PRINTNAME(a)); // the index name; e.g. "__col1__col2" - if (*tc1!='_' || *(tc1+1)!='_') { - // fix for #1396 - if (verbose) { - Rprintf(_("Dropping index '%s' as it doesn't have '__' at the beginning of its name. It was very likely created by v1.9.4 of data.table.\n"), tc1); - } - setAttrib(index, a, R_NilValue); - indexNo++; - s = CDR(s); - continue; // with next index - } - tc1 += 2; // tc1 always marks the start of a key column - if (!*tc1) internal_error(__func__, "index name ends with trailing __"); // # nocov - // check the position of the first appearance of an assigned column in the index. - // the new index will be truncated to this position. - char *s4 = malloc(strlen(c1) + 3); - if (!s4) { - internal_error(__func__, "Couldn't allocate memory for s4"); // # nocov - } - memcpy(s4, c1, strlen(c1)); - memset(s4 + strlen(c1), '\0', 1); - strcat(s4, "__"); // add trailing '__' to newKey so we can search for pattern '__colName__' also at the end of the index. - int newKeyLength = strlen(c1); - for(int i = 0; i < xlength(assignedNames); i++){ - tc2 = CHAR(STRING_ELT(assignedNames, i)); - char *s5 = malloc(strlen(tc2) + 5); //4 * '_' + \0 - if (!s5) { - free(s4); // # nocov - internal_error(__func__, "Couldn't allocate memory for s5"); // # nocov - } - memset(s5, '_', 2); - memset(s5 + 2, '\0', 1); - strcat(s5, tc2); - strcat(s5, "__"); - tc2 = strstr(s4, s5); - if(tc2 == NULL){ // column is not part of key - free(s5); - continue; - } - if(tc2 - s4 < newKeyLength){ // new column match is before last match - newKeyLength = tc2 - s4; - } - free(s5); - } - memset(s4 + newKeyLength, '\0', 1); // truncate the new key to the new length - if(newKeyLength == 0){ // no valid key column remains. Drop the key - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) { - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - } else if(newKeyLength < strlen(c1)) { - SEXP s4Str = PROTECT(mkString(s4)); - if(indexLength == 0 && // shortened index can be kept since it is just information on the order (see #2372) - LOGICAL(chin(s4Str, indexNames))[0] == 0) {// index with shortened name not present yet - SET_TAG(s, install(s4)); - SET_STRING_ELT(indexNames, indexNo, mkChar(s4)); - if (verbose) - Rprintf(_("Shortening index '%s' to '%s' due to an update on a key column\n"), c1+2, s4 + 2); - } else { // indexLength > 0 || shortened name present already - // indexLength > 0 indicates reordering. Drop it to avoid spurious reordering in non-indexed columns (#2372) - // shortened name already present indicates that index needs to be dropped to avoid duplicate indices. - setAttrib(index, a, R_NilValue); - SET_STRING_ELT(indexNames, indexNo, NA_STRING); - if (verbose) - Rprintf(_("Dropping index '%s' due to an update on a key column\n"), c1+2); - } - UNPROTECT(1); // s4Str - } //else: index is not affected by assign: nothing to be done - free(s4); - indexNo ++; - s = CDR(s); - } + struct attrib_name_ctx ctx = { 0, }; + R_mapAttrib(index, getOneAttribName, &ctx); // how many attributes? + hashtab *h = hash_create(ctx.indexNamesLen); + PROTECT(h->prot); + ctx.indexNames = h; + R_mapAttrib(index, getOneAttribName, &ctx); // now remember the names + ctx.index = index; + ctx.assignedNames = assignedNames; + ctx.verbose = verbose; + R_mapAttrib(index, fixIndexAttrib, &ctx); // adjust indices as needed + UNPROTECT(1); // h } if (ndelete) { // delete any columns assigned NULL (there was a 'continue' earlier in loop above) diff --git a/src/data.table.h b/src/data.table.h index 434d0a340..a7f787258 100644 --- a/src/data.table.h +++ b/src/data.table.h @@ -15,6 +15,8 @@ #endif #if R_VERSION < R_Version(4, 5, 0) # define isDataFrame(x) isFrame(x) // #6180 +# define CLEAR_ATTRIB(x) SET_ATTRIB(x, R_NilValue) +# define ANY_ATTRIB(x) (!(isNull(ATTRIB(x)))) #endif #include #define SEXPPTR_RO(x) ((const SEXP *)DATAPTR_RO(x)) // to avoid overhead of looped STRING_ELT and VECTOR_ELT @@ -103,6 +105,11 @@ } # define R_resizeVector(x, newlen) R_resizeVector_(x, newlen) #endif +// TODO(R>=4.6.0): remove the SVN revision check +#if R_VERSION < R_Version(4, 6, 0) || R_SVN_REVISION < 89194 +# define BACKPORT_MAP_ATTRIB +# define R_mapAttrib(x, fun, ctx) R_mapAttrib_(x, fun, ctx) +#endif // init.c extern SEXP char_integer64; @@ -343,6 +350,9 @@ SEXP R_allocResizableVector_(SEXPTYPE type, R_xlen_t maxlen); SEXP R_duplicateAsResizable_(SEXP x); void R_resizeVector_(SEXP x, R_xlen_t newlen); #endif +#ifdef BACKPORT_MAP_ATTRIB +SEXP R_mapAttrib_(SEXP x, SEXP (*fun)(SEXP key, SEXP val, void *ctx), void *ctx); +#endif SEXP is_direct_child(SEXP pids); // types.c diff --git a/src/dogroups.c b/src/dogroups.c index d5b9f01be..773f40b99 100644 --- a/src/dogroups.c +++ b/src/dogroups.c @@ -3,6 +3,8 @@ #include #include +static SEXP anySpecialAttribute(SEXP key, SEXP val, void *ctx); + static bool anySpecialStatic(SEXP x, hashtab * specials) { // Special refers to special symbols .BY, .I, .N, and .GRP; see special-symbols.Rd // Static because these are like C static arrays which are the same memory for each group; e.g., dogroups @@ -39,7 +41,7 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { // with PR#4164 started to copy input list columns too much. Hence PR#4655 in v1.13.2 moved that copy here just where it is needed. // Currently the marker is negative truelength. These specials are protected by us here and before we release them // we restore the true truelength for when R starts to use vector truelength. - SEXP attribs, list_el; + SEXP list_el; const int n = length(x); // use length() not LENGTH() because isNewList() is true for NULL if (n==0) @@ -53,20 +55,29 @@ static bool anySpecialStatic(SEXP x, hashtab * specials) { list_el = VECTOR_ELT(x,i); if (anySpecialStatic(list_el, specials)) return true; - for(attribs = ATTRIB(list_el); attribs != R_NilValue; attribs = CDR(attribs)) { - if (anySpecialStatic(CAR(attribs), specials)) - return true; // #4936 - } + if (R_mapAttrib(list_el, anySpecialAttribute, specials)) + return true; // #4936 } } return false; } +static SEXP anySpecialAttribute(SEXP key, SEXP val, void *specials) { + (void)key; + return anySpecialStatic(val, specials) ? R_NilValue : NULL; +} + +static SEXP findRowNames(SEXP key, SEXP val, void *data) { + (void)data; + if (key == R_RowNamesSymbol) return val; + return NULL; +} + SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEXP xjiscols, SEXP grporder, SEXP order, SEXP starts, SEXP lens, SEXP jexp, SEXP env, SEXP lhs, SEXP newnames, SEXP on, SEXP verboseArg, SEXP showProgressArg) { R_len_t ngrp, nrowgroups, njval=0, ngrpcols, ansloc=0, maxn, estn=-1, thisansloc, grpn, thislen, igrp; int nprotect=0; - SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, rownames, s, RHS, target, source; + SEXP ans=NULL, jval, thiscol, BY, N, I, GRP, iSD, xSD, s, RHS, target, source; Rboolean wasvector, firstalloc=FALSE, NullWarnDone=FALSE; const bool verbose = LOGICAL(verboseArg)[0]==1; double tstart=0, tblock[10]={0}; int nblock[10]={0}; // For verbose printing, tstart is updated each block @@ -130,11 +141,11 @@ SEXP dogroups(SEXP dt, SEXP dtcols, SEXP groups, SEXP grpcols, SEXP jiscols, SEX R_LockBinding(install(".I"), env); SEXP dtnames = PROTECT(getAttrib(dt, R_NamesSymbol)); nprotect++; // added here to fix #91 - `:=` did not issue recycling warning during "by" - // fetch rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to + + // override rownames of .SD. rownames[1] is set to -thislen for each group, in case .SD is passed to // non data.table aware package that uses rownames - for (s = ATTRIB(SD); s != R_NilValue && TAG(s)!=R_RowNamesSymbol; s = CDR(s)); // getAttrib0 basically but that's hidden in attrib.c; #loop_counter_not_local_scope_ok - if (s==R_NilValue) error(_("row.names attribute of .SD not found")); - rownames = CAR(s); + SEXP rownames = PROTECT(R_mapAttrib(SD, findRowNames, NULL)); nprotect++; + if (rownames == NULL) error(_("row.names attribute of .SD not found")); if (!isInteger(rownames) || LENGTH(rownames)!=2 || INTEGER(rownames)[0]!=NA_INTEGER) error(_("row.names of .SD isn't integer length 2 with NA as first item; i.e., .set_row_names(). [%s %d %d]"),type2char(TYPEOF(rownames)),LENGTH(rownames),INTEGER(rownames)[0]); // fetch names of .SD and prepare symbols. In case they are copied-on-write by user assigning to those variables diff --git a/src/mergelist.c b/src/mergelist.c index 51f28d224..90854ae82 100644 --- a/src/mergelist.c +++ b/src/mergelist.c @@ -17,18 +17,21 @@ SEXP copyCols(SEXP x, SEXP cols) { return R_NilValue; } +static SEXP setDuplicateOneAttrib(SEXP key, SEXP val, void *x) { + setAttrib(x, PROTECT(key), PROTECT(shallow_duplicate(val))); + UNPROTECT(2); + return NULL; // continue +} + void mergeIndexAttrib(SEXP to, SEXP from) { if (!isInteger(to) || LENGTH(to)!=0) internal_error(__func__, "'to' must be integer() already"); // # nocov if (isNull(from)) return; - SEXP t = ATTRIB(to), f = ATTRIB(from); - if (isNull(t)) // target has no attributes -> overwrite - SET_ATTRIB(to, shallow_duplicate(f)); - else { - for (t = ATTRIB(to); CDR(t) != R_NilValue; t = CDR(t)); // traverse to end of attributes list of to - SETCDR(t, shallow_duplicate(f)); - } + if (!ANY_ATTRIB(to)) // target has no attributes -> overwrite + SHALLOW_DUPLICATE_ATTRIB(to, from); + else + R_mapAttrib(from, setDuplicateOneAttrib, to); } SEXP cbindlist(SEXP x, SEXP copyArg) { @@ -84,7 +87,7 @@ SEXP cbindlist(SEXP x, SEXP copyArg) { key = getAttrib(thisx, sym_sorted); UNPROTECT(protecti); // thisnames, thisxcol } - if (isNull(ATTRIB(index))) + if (!ANY_ATTRIB(index)) setAttrib(ans, sym_index, R_NilValue); setAttrib(ans, R_NamesSymbol, names); setAttrib(ans, sym_sorted, key); diff --git a/src/nafill.c b/src/nafill.c index 4187523c5..67bd034c6 100644 --- a/src/nafill.c +++ b/src/nafill.c @@ -218,7 +218,7 @@ SEXP nafillR(SEXP obj, SEXP type, SEXP fill, SEXP nan_is_na_arg, SEXP inplace, S if (!binplace) { for (R_len_t i=0; i