Scripts/Fast File Read
< Scripts
Jump to navigation
Jump to search
Fast file Reading
Fast extraction of a relatively small amount of data from a large file is easy with J, using mapped files.
For example, the distinct IP addresses in a log file can be extracted as follows:
J version: Extract distinct IP addresses following a label from a large file
findInFile=: 4 : 0
NB. Return the distinct data items that follow a label in a file.
NB. x: label preceding data
NB. y: the file name
NB. each data item is taken to end at the first blank space after the label
JCHAR map_jmf_ 'file';y NB. map y as characters onto the noun file; mapped files really speed things up
nos=. x I.@:E. file NB. find the positions of the label in file
ip=.(nos+/(#x)+i.16){file NB. one row per label: the 16 characters that follow it (maximum no. of columns)
NB. NOTE(review): the indexing above looks unguarded -- a label closer than (#x)+16
NB. characters to the end of the file would presumably raise an index error and
NB. skip the unmap below. Confirm.
unmap_jmf_ 'file'
~.({."0 1~ (i."1 &' '))ip NB. keep each row only up to its first blank
NB. return the unique values
)
On a small Acer Aspire One, J took about 0.65 seconds to extract 16000 IP addresses from a 38 MB file and reduce them to the 10 distinct addresses.
This program could have been written in C, saving perhaps 0.30 secs, but with a bit more effort.
test=: 3 : 0
NB. Build a large test file, run findInFile on it, then erase the file.
file=:'testfile.2'
NB. 20000 records, each made of the label ' rhost=', an address-like string and
NB. 2000 blanks of padding. The address-like strings are 15-character slices
NB. (blanks removed) of 40 random numbers in 1..255 joined with '.'.
out=.,(20000 2000$' '),.~' rhost=',"1(' '-.~"1(}:"1 (20000 16$,'.',"1~":>:?40 1$255)))
out fwrite file
label =.' rhost='
label findInFile file
NB. NOTE(review): the findInFile sentence is not the last one, so its result does
NB. not appear to be what test returns -- confirm.
NB. NOTE(review): file is assigned globally (=:) here while findInFile maps and
NB. unmaps a noun that is also called file -- confirm the name still holds the
NB. file name when ferase runs.
ferase file
)
C version: Extract distinct IP addresses following a label from a large file
//////////////////////////////////////////////////////
// //
// Small C mmap() sample. //
// Written by Martin Cyr. //
// Feel free to change and distribute, but credit //
// is always nice. If you use, I'd be pleased to //
// hear from you at Spooles at GMail dot com. //
// //
//////////////////////////////////////////////////////
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
/* Number of host slots the host table starts with (initial hostMax in processFile). */
#define INITIAL_CAPACITY 25
/* Size in bytes of the buffers that hold one host string. */
#define IP_CHAR_LEN 16
/* NOTE(review): not referenced anywhere in the visible code -- confirm before removing. */
#define LINE_CHAR_LEN 1024
/* Label that precedes each host in the input; note the leading blank. */
#define PATTERN " rhost="
/*
 * Forward declarations.
 * Every declaration is a full prototype so the compiler checks each call;
 * an empty parameter list "()" leaves the arguments unchecked and, from C23
 * on, means "no parameters", which would contradict showUsage's definition.
 */
void showUsage(char* filename);
int processFile(char* filename, char*** hosts);
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset);
int countMatches(char* match, char** array, int count);
void printDestroyArray(char** array, int count);
/*
 * Entry point: expects exactly one argument, the file to scan.
 * Prints every distinct host found in that file, or the usage text when the
 * argument count is anything other than one.
 */
int main(int argc, char** argv)
{
    char** hosts;
    int hostCount;

    if (argc != 2)
    {
        /* Too few or too many arguments both end up here. */
        showUsage(argv[0]);
        return 0;
    }

    hostCount = processFile(argv[1], &hosts);
    printDestroyArray(hosts, hostCount);   /* prints and frees each string */
    free(hosts);                           /* then release the table itself */
    return 0;
}
/*
 * Print a short usage message to stdout.
 *
 * filename: the name the program was invoked with (argv[0]). It may be NULL
 *           when the program is started with an empty argument vector, so a
 *           placeholder is printed in that case instead of passing NULL to
 *           printf's %s, which is undefined.
 */
void showUsage(char* filename)
{
    if (filename == NULL)
        filename = "program";

    printf("Usage: %s <filename>\n", filename);
    printf("\tParses the <filename> for occurrences of rhost= \n");
    printf("\tand sends everything to stdout\n");
}
/*
 * Count how many of the first `count` strings in `array` are equal to `match`.
 *
 * match: NUL-terminated string to look for.
 * array: table of NUL-terminated strings.
 * count: number of entries of `array` to examine; nothing is examined when it
 *        is zero or negative.
 * Returns the number of entries that compare equal to `match`.
 */
int countMatches(char* match, char** array, int count)
{
    int hits = 0;
    int idx = 0;

    while (idx < count)
    {
        /* strcmp yields 0 on equality, so the negation adds 1 per hit. */
        hits += !strcmp(match, array[idx]);
        idx++;
    }
    return hits;
}
/*
 * Print each of the first `count` strings of `array` on its own line and
 * free it. The table `array` itself is not freed; that is left to the caller.
 */
void printDestroyArray(char** array, int count)
{
    int idx = 0;

    while (idx < count)
    {
        char* entry = array[idx];

        printf("%s\n", entry);
        free(entry);
        idx++;
    }
}
/* Returns non-zero when c ends a host token: blank, tab, line break or NUL. */
static int isHostEnd(char c)
{
    return (c == ' ') || (c == '\n') || (c == '\r') || (c == '\t') || (c == '\0');
}

/*
 * Collect the distinct hosts that follow PATTERN in a file.
 *
 * filename: path of the file to scan; it is mapped read-only.
 * hosts:    out parameter; receives a freshly allocated table of
 *           NUL-terminated strings. The caller owns the table and every
 *           string in it and must free them all.
 * Returns the number of strings stored in *hosts (0 for an empty file).
 *
 * A host is the run of characters after PATTERN up to the next blank, tab or
 * line break, cut to at most IP_CHAR_LEN - 1 characters.
 * Any failure to open, stat, map or allocate prints a message and exits.
 */
int processFile(char* filename, char*** hosts)
{
    int hostCount = 0, hostMax = INITIAL_CAPACITY;
    size_t patternLen = strlen(PATTERN);
    size_t size, pos, len;
    int fd;
    char* map;
    char token[IP_CHAR_LEN];
    struct stat results;

    /* The table holds pointers, so it is sized in pointers, not in IP_CHAR_LEN bytes. */
    *hosts = malloc((size_t)hostMax * sizeof(**hosts));
    if (*hosts == NULL)
    {
        perror("Unable to allocate the host table");
        exit(EXIT_FAILURE);
    }
    fd = open(filename, O_RDONLY);
    if (fd == -1)
    {
        perror("Error opening file");
        exit(EXIT_FAILURE);
    }
    /* fstat() on the open descriptor cannot see a different file than open() did. */
    if (fstat(fd, &results) != 0)
    {
        perror("Unable to get file stats");
        exit(EXIT_FAILURE);
    }
    if (results.st_size <= 0)
    {
        /* mmap() rejects a zero length, and an empty file holds no hosts anyway. */
        close(fd);
        return 0;
    }
    size = (size_t)results.st_size;
    map = mmap(0, size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED)
    {
        perror("Error mapping the file");
        exit(EXIT_FAILURE);
    }

    pos = 0;
    while (pos + patternLen <= size)
    {
        /*
         * Test the pattern at every position. A running match counter that is
         * merely reset on a mismatch drops the current byte and so misses a
         * pattern that starts right there (e.g. two blanks before "rhost=").
         */
        if ((map[pos] != PATTERN[0]) || (memcmp(map + pos, PATTERN, patternLen) != 0))
        {
            pos++;
            continue;
        }
        pos += patternLen;

        /*
         * addNextHost() has no length parameter, so it is handed a bounded,
         * NUL-terminated copy of the token instead of the raw mapping. This
         * keeps every read inside the file and every write inside the buffer.
         */
        len = 0;
        while ((len < IP_CHAR_LEN - 1) && (pos + len < size) && !isHostEnd(map[pos + len]))
        {
            token[len] = map[pos + len];
            len++;
        }
        token[len] = '\0';
        hostCount = addNextHost(hosts, hostCount, &hostMax, token, 0);
    }

    if (munmap(map, size) == -1)
    {
        perror("Error unmapping the file");
    }
    close(fd);
    return hostCount;
}

/*
 * Read one host token from map + offset and append it to the table unless it
 * is empty or already present.
 *
 * hosts:   address of the host table; the table may be reallocated.
 * hostNum: number of hosts currently stored.
 * hostMax: capacity of the table in entries; updated when the table grows.
 * map:     text to read from.
 * offset:  index in map of the first character of the token.
 * Returns the new number of hosts stored.
 *
 * The token ends at a blank, tab, line break or NUL and is cut to
 * IP_CHAR_LEN - 1 characters so that it always fits its buffer.
 * Allocation failures print a message and exit.
 */
int addNextHost(char*** hosts, int hostNum, int* hostMax, char* map, int offset)
{
    char host[IP_CHAR_LEN];
    char** grown;
    int newMax;
    int pos = 0;

    while ((pos < IP_CHAR_LEN - 1) && !isHostEnd(map[offset + pos]))
    {
        host[pos] = map[offset + pos];
        pos++;
    }
    host[pos] = '\0';

    if ((pos == 0) || (countMatches(host, *hosts, hostNum) != 0))
        return hostNum;

    /* Grow as soon as the table is full: slot hostNum must exist before it is written. */
    if (hostNum >= *hostMax)
    {
        newMax = (*hostMax > 0) ? (*hostMax * 2) : INITIAL_CAPACITY;
        grown = realloc(*hosts, (size_t)newMax * sizeof(*grown));
        if (grown == NULL)
        {
            perror("Unable to grow the host table");
            exit(EXIT_FAILURE);
        }
        *hosts = grown;
        *hostMax = newMax;
    }

    (*hosts)[hostNum] = calloc(IP_CHAR_LEN, sizeof(char));
    if ((*hosts)[hostNum] == NULL)
    {
        perror("Unable to allocate a host entry");
        exit(EXIT_FAILURE);
    }
    memcpy((*hosts)[hostNum], host, (size_t)pos + 1);
    return hostNum + 1;
}