Michiel O. wrote: ↑Wed Jul 17, 2019 7:12 am
jahboater wrote: ↑
Would it be possible to replace the msortchar() function (which looks very slow) with qsort() ?
Yes, that is entirely possible:
$ cat strsort.c
Code: Select all
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
int charcmp(const void *a, const void *b) {
return *(char *)a - *(char *)b;
}
int main(void) {
char s[] = "the quick brown fox etc...";
qsort(s, strlen(s), sizeof(char), charcmp);
printf("%s\n", s);
}
$ cc strsort.c && ./a.out
...bcceefhiknooqrttuwx
Replacement of the msortchar routine by a call to a character qsort as described above made the C program 24 lines shorter and lead to the following:
Code: Select all
/* qanagram.c -- Find anagrams in /usr/share/dict/british-english-insane
Written July 16, 2019 by Eric Olson
This program is an example of how insane it is to work with
pointers and C to find anagrams.
*/
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <ctype.h>
#include <string.h>
#include <strings.h>
static char *words,*keys,*cursor;
static char *skipword(){
for(;*cursor;cursor++){
if(*cursor=='\n') return ++cursor;
}
return 0;
}
static char *getword(){
do {
char *r=cursor;
while(islower(*cursor)) cursor++;
if(*cursor=='\n'){
*cursor++=0;
return r;
}
} while(skipword());
return 0;
}
typedef struct { int p,n; } indtype;
static indtype *ind; static int indmax=0;
static int indcomp(const void *a,const void *b){
int r;
r=strcmp(keys+((indtype *)a)->p,keys+((indtype *)b)->p);
if(r) return r;
return strcmp(words+((indtype *)a)->p,words+((indtype *)b)->p);
}
static int anacomp(const void *a,const void *b){
return strcmp(words+ind[((indtype *)a)->p].p,
words+ind[((indtype *)b)->p].p);
}
static int charcomp(const void *a,const void *b){
return *(char *)a-*(char *)b;
}
int main(){
int fd=open("/usr/share/dict/british-english-insane",O_RDONLY);
struct stat fdstat;
fstat(fd,&fdstat);
words=malloc(fdstat.st_size+1);
read(fd,words,fdstat.st_size);
words[fdstat.st_size]=0;
close(fd);
int lines=2;
for(char *p=words;*p;p++) if(*p=='\n') lines++;
ind=malloc(lines*sizeof(indtype));
cursor=words;
for(;;){
char *word=getword();
if(!word) break;
ind[indmax].p=word-words;
int r=cursor-word-1;
ind[indmax++].n=r;
}
keys=malloc(fdstat.st_size+1);
memcpy(keys,words,fdstat.st_size+1);
for(int i=0;i<indmax;i++){
qsort(keys+ind[i].p,ind[i].n,sizeof(char),charcomp);
}
qsort(ind,indmax,sizeof(indtype),indcomp);
int flag=0;
indtype *ana; int anamax=0;
ana=malloc(sizeof(indtype)*lines/2);
for(int i=1;i<indmax;i++){
if(!strcmp(keys+ind[i-1].p,keys+ind[i].p)){
if(!flag){
ana[anamax].p=i-1;
flag=1;
}
} else {
if(flag){
ana[anamax++].n=i;
flag=0;
}
}
}
if(flag) ana[anamax++].n=indmax;
qsort(ana,anamax,sizeof(indtype),anacomp);
for(int i=0;i<anamax;i++){
int j=ana[i].p;
printf("%s: %s",words+ind[j].p,words+ind[j+1].p);
for(j+=2;j<ana[i].n;j++){
printf(", %s",words+ind[j].p);
}
printf("\n");
}
return 0;
}
Running on the Raspberry Pi 3B+ using the insane British dictionary leads to
Code: Select all
$ time ./qanagram >qanagram.insane
real 0m1.411s
user 0m1.370s
sys 0m0.041s
$ time ./anagram >anagram.insane
real 0m1.093s
user 0m1.023s
sys 0m0.071s
$ time ./anagram.perl >perl.insane
real 0m13.631s
user 0m13.519s
sys 0m0.111s
$ md5sum *.insane
bec74aa3b31577edbb291aeb7269a4d5 anagram.insane
bec74aa3b31577edbb291aeb7269a4d5 perl.insane
bec74aa3b31577edbb291aeb7269a4d5 qanagram.insane
which shows, in spite of the additional green-house gases, that the original code using merge sort is about 1.3 times faster. Both C programs are noticeably faster than the Perl program and produce the same result.
Has anyone tested the approach described in
this post of creating a vector of letter counts rather than sorting?