You never reset `matches` to `NULL` after `free()`ing it, so on the next round of the loop it still holds the stale pointer left over from the first round of reallocs, and `realloc()` is handed memory that has already been freed.
Either set it to `NULL` prior to entering the inner processing loop (redundant on the first pass), or set it back to `NULL` immediately after the `free(matches)`. Alternatively, you could simply make it local to the enclosing scope of the next-outer loop, initialized to `NULL` as you have it now, but the two changes mentioned above are the most minimal I can think of.
Examples
// Option 1: clear the pointer before collecting matches for a page, so the
// first realloc() below does not receive a pointer that was already freed.
matches = NULL; // HERE
// Keep searching the page until pcre_exec() returns a negative value.
while((pairs = pcre_exec(f, f_ext, page, sizeof(page), a, PCRE_NOTEMPTY, vector, vecsize)) >=0)
{
// Fetch substring 0 of the current match into buff.
pcre_get_substring(page, vector, pairs, 0, &buff);
//printf("%s\n", buff);
// Grow the array by one slot; the result goes into a temporary so that
// matches is only overwritten when the reallocation succeeded.
more_matches = realloc(matches, (count+1)* sizeof(*more_matches));
if (more_matches!=NULL)
{
matches=more_matches;
matches[count++]=buff;
}
else
{
// Reallocation failed: release the old array, report, and terminate.
free(matches);
puts("Error (re)allocating memory");
exit(1);
}
// Next search starts one position past vector[1]
// (presumably the end offset of the current match -- confirm against the PCRE ovector layout).
a = vector[1] + 1;
}
Or....
// Print every unique match collected for this page, one per line.
for (i = 0; i < uniques_len; i++)
{
printf("%s\n", uniques[i]);
}
// Option 2: release the array and clear the pointer right away, so the next
// page's realloc() starts from NULL instead of from freed memory.
free(matches);
matches = NULL; // or HERE
pcre_free(f);
More Stuff
Continuing down the path of things I've noticed:
This:
ssize_t ret;
ret = read(fd, page, sizeof(page));
if (ret > 0) {
page[ret] = '\0';
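appears to be trying to set a null-character terminator. If so, you're invoking undefined behavior whenever the buffer is fully populated: `read()` can return `sizeof(page)`, and `page[sizeof(page)]` is one past the end of the array. It should be this: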
ssize_t ret = read(fd, page, sizeof(page)-1); // NOTE SPACE FOR TERM
if (ret > 0) {
page[ret] = 0;
If the size of the buffer matters (you chose 4K for a reason), declare it as 4097 bytes so that a full 4K of data still leaves room for the terminator.
And another...
You're reading the page, and I cannot say whether it needs to be terminated as I showed in the code above. But assuming it does and you did as I suggested (or not), this also looks wrong:
while((pairs = pcre_exec(f, f_ext, page, sizeof(page), a, PCRE_NOTEMPTY, vector, vecsize)) >=0)
Here you're passing the size of the entire buffer, not the size of the data you actually read. I'll be the first to tell you I'm unfamiliar with the API, but I'm fairly sure this should be:
// notice the length of the buffer passed, ret
while((pairs = pcre_exec(f, f_ext, page, ret, a, PCRE_NOTEMPTY, vector, vecsize)) >=0)
In other words, on a short read you're telling it the data is longer than it really is. Again, I'm new to their API, but this seems reasonable.
Of Unique Matches...
Hopefully easier to read.
// Collect the distinct strings from matches[0 .. count) in first-seen order,
// then print them one per line. uniques only borrows the pointers stored in
// matches; it does not copy the strings.
int matches_len = count, uniques_len = 0;
int i = 0, j = 0;
// A variable-length array must have a size greater than zero, so reserve at
// least one slot even when there were no matches on this page.
const char *uniques[matches_len > 0 ? matches_len : 1];
for (i = 0; i < matches_len; ++i)
{
    // Look for matches[i] among the uniques gathered so far.
    for (j = 0; j < uniques_len; ++j)
    {
        if (!strcmp(matches[i], uniques[j]))
            break;
    }
    // j ran off the end of the list: this string has not been seen before.
    if (j == uniques_len)
        uniques[uniques_len++] = matches[i];
}
for (i = 0; i < uniques_len; ++i)
    printf("%s\n", uniques[i]);
Continuing on...
Reset `count` to zero after each page. Right after the `free(matches); matches = NULL;` would be a good place.
Worth noting: you have no exit case in your outer loop once the file reads start failing, so the code will keep hammering on a file that cannot seek beyond its end until you reach your limiter count.
Final Thoughts
I think this is close to what you're trying to do:
#define _LARGEFILE64_SOURCE
#include <unistd.h>
#include <stdio.h>
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <limits.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/ptrace.h>
#include <sys/wait.h>
#include <pcre.h>
#include <locale.h>
#include <ctype.h>
/*
 * Scan another process's memory for 8-digit hexadecimal strings.
 *
 * Attaches to the target with ptrace, walks the address ranges listed in
 * /proc/<pid>/maps, reads each range from /proc/<pid>/mem in chunks of at
 * most 4096 bytes, and accumulates every distinct regex match across all
 * chunks. The unique matches are printed one per line, followed by their
 * count.
 *
 * argv[1], when given, is the target process id; with no argument the
 * built-in default (5916) is used.
 *
 * Returns 0 on success, -1 if the pattern fails to compile, and 1 if the
 * pid argument is invalid or the target cannot be attached.
 */
int main(int argc, char **argv)
{
    enum { PAGE_BYTES = 4096, OV_SIZE = 128 };

    int pid = 5916;
    if (argc > 1)
    {
        char *end_ptr = NULL;
        errno = 0;
        long parsed = strtol(argv[1], &end_ptr, 10);
        if (errno != 0 || end_ptr == argv[1] || *end_ptr != '\0'
            || parsed <= 0 || parsed > INT_MAX)
        {
            fprintf(stderr, "Invalid process id: %s\n", argv[1]);
            return 1;
        }
        pid = (int)parsed;
    }

    setlocale(LC_ALL,"");

    const char *error = NULL;
    int erroffset = 0;
    const char **uniques = NULL;   // owned list of unique match strings
    size_t uniques_len = 0;

    const char regex[] = "[0-9A-Fa-f]{8}";
    pcre *re = pcre_compile(regex,      /* the pattern */
                            PCRE_MULTILINE|PCRE_DOTALL|PCRE_NEWLINE_ANYCRLF,
                            &error,     /* for error message */
                            &erroffset, /* for error offset */
                            0);         /* use default character tables */
    if (!re)
    {
        printf("pcre_compile failed (offset: %d), %s\n", erroffset, error);
        return -1;
    }

    // start proc trace
    long ptret = ptrace(PTRACE_ATTACH, pid, 0, 0);
    if (ptret == -1)
    {
        fprintf(stderr, "Ptrace failed: %s\n", strerror(errno));
        pcre_free(re);
        return 1;
    }

    // PTRACE_ATTACH only requests the stop; wait until the tracee has
    // actually stopped before reading its memory. A failure here is
    // reported but not treated as fatal.
    if (waitpid(pid, NULL, 0) == -1)
    {
        fprintf(stderr, "waitpid failed: %s\n", strerror(errno));
    }

    char path[256];
    snprintf(path, sizeof(path), "/proc/%d/maps", pid);
    FILE *maps = fopen(path, "r");
    snprintf(path, sizeof(path), "/proc/%d/mem", pid);
    int mem = open(path, O_RDONLY);

    if (maps && (mem != -1))
    {
        char buf[BUFSIZ + 1];
        while (fgets(buf, BUFSIZ, maps))
        {
            long long unsigned int start = 0, end = 0;
            if (sscanf(buf, "%llx-%llx", &start, &end) != 2)
            {
                break;
            }

            printf("reading %llx - %llx\n", start, end);

            // If the seek fails the file offset is wherever the previous
            // range left it, so reading would return the wrong memory.
            if (lseek64(mem, start, SEEK_SET) == -1)
            {
                continue;
            }

            while (start < end)
            {
                char page[PAGE_BYTES] = {0};
                unsigned long long remaining = end - start;
                size_t want = remaining < sizeof(page) ? (size_t)remaining
                                                       : sizeof(page);

                ssize_t rd = read(mem, page, want);
                if (rd <= 0)
                {
                    break;  // error or nothing more to read in this range
                }

                // Advance by what was actually read so that `start` stays
                // in step with the file offset on short reads.
                start += (unsigned long long)rd;

                int ov[OV_SIZE] = {0};
                int offset = 0;
                int rc = 0;
                while ((rc = pcre_exec(re, 0, page, (int)rd, offset, 0, ov, OV_SIZE)) >= 0)
                {
                    // rc == 0 means the offset vector was too small for all
                    // captures; only the first OV_SIZE/3 pairs are valid.
                    int groups = rc > 0 ? rc : OV_SIZE / 3;

                    for (int i = 0; i < groups; ++i)
                    {
                        const char *sp = NULL;
                        if (pcre_get_substring(page, ov, groups, i, &sp) < 0 || sp == NULL)
                        {
                            continue;
                        }

                        // search unique list
                        size_t j = 0;
                        for (; j < uniques_len; ++j)
                        {
                            if (!strcmp(sp, uniques[j]))
                            {
                                break;
                            }
                        }

                        if (j < uniques_len)
                        {
                            // delete string. not needed
                            pcre_free_substring(sp);
                            continue;
                        }

                        const char **tmp = realloc(uniques, (uniques_len + 1) * sizeof(*uniques));
                        if (tmp == NULL)
                        {
                            perror("Failed to resize uniques.");
                            pcre_free_substring(sp);
                        }
                        else
                        {
                            uniques = tmp;
                            uniques[uniques_len++] = sp;  // list now owns sp
                        }
                    }

                    // Resume one byte past the start of this match, so
                    // overlapping matches are found as well. For a pattern
                    // without capture groups this equals ov[2*(rc-1)]+1,
                    // but it cannot index before ov[] or pick up an unset
                    // (-1) group offset.
                    offset = ov[0] + 1;
                }
            }
        }
    }
    else
    {
        fprintf(stderr, "Failed to open /proc/%d/maps or /proc/%d/mem\n", pid, pid);
    }

    // Close whichever of the two handles was opened, even if the other failed.
    if (maps)
    {
        fclose(maps);
    }
    if (mem != -1)
    {
        close(mem);
    }

    for (size_t n = 0; n < uniques_len; ++n)
    {
        printf("%s\n", uniques[n]);
        pcre_free_substring(uniques[n]);
    }
    printf("total uniques: %zu\n", uniques_len);

    free(uniques);
    pcre_free(re);

    ptrace(PTRACE_DETACH, pid, 0, 0);
    return 0;
}
Caveat: I know nothing about this API beyond what I've seen here and briefly reviewed online. YMMV, UAYOR. But it seems to me you had it all along: just accumulate uniques independently of pages (matches that straddle a page boundary will still be a problem, I think, but that's for another day).