diffreg.c - 9base - revived minimalist port of Plan 9 userland to Unix
  HTML git clone git://git.suckless.org/9base
   DIR Log
   DIR Files
   DIR Refs
   DIR README
   DIR LICENSE
       ---
       diffreg.c (8845B)
       ---
            1 #include <u.h>
            2 #include <libc.h>
            3 #include <bio.h>
            4 #include "diff.h"
            5 
            6 /*        diff - differential file comparison
            7 *
            8 *        Uses an algorithm due to Harold Stone, which finds
            9 *        a pair of longest identical subsequences in the two
           10 *        files.
           11 *
           12 *        The major goal is to generate the match vector J.
           13 *        J[i] is the index of the line in file1 corresponding
           14 *        to line i file0. J[i] = 0 if there is no
           15 *        such line in file1.
           16 *
           17 *        Lines are hashed so as to work in core. All potential
           18 *        matches are located by sorting the lines of each file
           19 *        on the hash (called value). In particular, this
           20 *        collects the equivalence classes in file1 together.
           21 *        Subroutine equiv replaces the value of each line in
           22 *        file0 by the index of the first element of its 
           23 *        matching equivalence in (the reordered) file1.
           24 *        To save space equiv squeezes file1 into a single
           25 *        array member in which the equivalence classes
           26 *        are simply concatenated, except that their first
           27 *        members are flagged by changing sign.
           28 *
           29 *        Next the indices that point into member are unsorted into
           30 *        array class according to the original order of file0.
           31 *
           32 *        The cleverness lies in routine stone. This marches
           33 *        through the lines of file0, developing a vector klist
           34 *        of "k-candidates". At step i a k-candidate is a matched
           35 *        pair of lines x,y (x in file0 y in file1) such that
           36 *        there is a common subsequence of lenght k
           37 *        between the first i lines of file0 and the first y 
           38 *        lines of file1, but there is no such subsequence for
           39 *        any smaller y. x is the earliest possible mate to y
           40 *        that occurs in such a subsequence.
           41 *
           42 *        Whenever any of the members of the equivalence class of
           43 *        lines in file1 matable to a line in file0 has serial number 
           44 *        less than the y of some k-candidate, that k-candidate 
           45 *        with the smallest such y is replaced. The new 
           46 *        k-candidate is chained (via pred) to the current
           47 *        k-1 candidate so that the actual subsequence can
           48 *        be recovered. When a member has serial number greater
           49 *        that the y of all k-candidates, the klist is extended.
           50 *        At the end, the longest subsequence is pulled out
           51 *        and placed in the array J by unravel.
           52 *
           53 *        With J in hand, the matches there recorded are
           54 *        check'ed against reality to assure that no spurious
           55 *        matches have crept in due to hashing. If they have,
           56 *        they are broken, and "jackpot " is recorded--a harmless
           57 *        matter except that a true match for a spuriously
           58 *        mated line may now be unnecessarily reported as a change.
           59 *
           60 *        Much of the complexity of the program comes simply
           61 *        from trying to minimize core utilization and
           62 *        maximize the range of doable problems by dynamically
           63 *        allocating what is needed and reusing what is not.
           64 *        The core requirements for problems larger than somewhat
           65 *        are (in words) 2*length(file0) + length(file1) +
           66 *        3*(number of k-candidates installed),  typically about
           67 *        6n words for files of length n. 
           68 */
           69 /* TIDY THIS UP */
           70 struct cand {
           71         int x;
           72         int y;
           73         int pred;
           74 } cand;
           75 struct line {
           76         int serial;
           77         int value;
           78 } *file[2], line;
           79 int len[2];
           80 int binary;
           81 struct line *sfile[2];        /*shortened by pruning common prefix and suffix*/
           82 int slen[2];
           83 int pref, suff;        /*length of prefix and suffix*/
           84 int *class;        /*will be overlaid on file[0]*/
           85 int *member;        /*will be overlaid on file[1]*/
           86 int *klist;                /*will be overlaid on file[0] after class*/
           87 struct cand *clist;        /* merely a free storage pot for candidates */
           88 int clen;
           89 int *J;                /*will be overlaid on class*/
           90 long *ixold;        /*will be overlaid on klist*/
           91 long *ixnew;        /*will be overlaid on file[1]*/
           92 /* END OF SOME TIDYING */
           93 
           94 static void        
           95 sort(struct line *a, int n)        /*shellsort CACM #201*/
           96 {
           97         int m;
           98         struct line *ai, *aim, *j, *k;
           99         struct line w;
          100         int i;
          101 
          102         m = 0;
          103         for (i = 1; i <= n; i *= 2)
          104                 m = 2*i - 1;
          105         for (m /= 2; m != 0; m /= 2) {
          106                 k = a+(n-m);
          107                 for (j = a+1; j <= k; j++) {
          108                         ai = j;
          109                         aim = ai+m;
          110                         do {
          111                                 if (aim->value > ai->value ||
          112                                    aim->value == ai->value &&
          113                                    aim->serial > ai->serial)
          114                                         break;
          115                                 w = *ai;
          116                                 *ai = *aim;
          117                                 *aim = w;
          118 
          119                                 aim = ai;
          120                                 ai -= m;
          121                         } while (ai > a && aim >= ai);
          122                 }
          123         }
          124 }
          125 
          126 static void
          127 unsort(struct line *f, int l, int *b)
          128 {
          129         int *a;
          130         int i;
          131 
          132         a = MALLOC(int, (l+1));
          133         for(i=1;i<=l;i++)
          134                 a[f[i].serial] = f[i].value;
          135         for(i=1;i<=l;i++)
          136                 b[i] = a[i];
          137         FREE(a);
          138 }
          139 
          140 static void
          141 prune(void)
          142 {
          143         int i,j;
          144 
          145         for(pref=0;pref<len[0]&&pref<len[1]&&
          146                 file[0][pref+1].value==file[1][pref+1].value;
          147                 pref++ ) ;
          148         for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
          149                 file[0][len[0]-suff].value==file[1][len[1]-suff].value;
          150                 suff++) ;
          151         for(j=0;j<2;j++) {
          152                 sfile[j] = file[j]+pref;
          153                 slen[j] = len[j]-pref-suff;
          154                 for(i=0;i<=slen[j];i++)
          155                         sfile[j][i].serial = i;
          156         }
          157 }
          158 
          159 static void
          160 equiv(struct line *a, int n, struct line *b, int m, int *c)
          161 {
          162         int i, j;
          163 
          164         i = j = 1;
          165         while(i<=n && j<=m) {
          166                 if(a[i].value < b[j].value)
          167                         a[i++].value = 0;
          168                 else if(a[i].value == b[j].value)
          169                         a[i++].value = j;
          170                 else
          171                         j++;
          172         }
          173         while(i <= n)
          174                 a[i++].value = 0;
          175         b[m+1].value = 0;
          176         j = 0;
          177         while(++j <= m) {
          178                 c[j] = -b[j].serial;
          179                 while(b[j+1].value == b[j].value) {
          180                         j++;
          181                         c[j] = b[j].serial;
          182                 }
          183         }
          184         c[j] = -1;
          185 }
          186 
          187 static int
          188 newcand(int x, int  y, int pred)
          189 {
          190         struct cand *q;
          191 
          192         clist = REALLOC(clist, struct cand, (clen+1));
          193         q = clist + clen;
          194         q->x = x;
          195         q->y = y;
          196         q->pred = pred;
          197         return clen++;
          198 }
          199 
          200 static int
          201 search(int *c, int k, int y)
          202 {
          203         int i, j, l;
          204         int t;
          205 
          206         if(clist[c[k]].y < y)        /*quick look for typical case*/
          207                 return k+1;
          208         i = 0;
          209         j = k+1;
          210         while((l=(i+j)/2) > i) {
          211                 t = clist[c[l]].y;
          212                 if(t > y)
          213                         j = l;
          214                 else if(t < y)
          215                         i = l;
          216                 else
          217                         return l;
          218         }
          219         return l+1;
          220 }
          221 
          222 static int
          223 stone(int *a, int n, int *b, int *c)
          224 {
          225         int i, k,y;
          226         int j, l;
          227         int oldc, tc;
          228         int oldl;
          229 
          230         k = 0;
          231         c[0] = newcand(0,0,0);
          232         for(i=1; i<=n; i++) {
          233                 j = a[i];
          234                 if(j==0)
          235                         continue;
          236                 y = -b[j];
          237                 oldl = 0;
          238                 oldc = c[0];
          239                 do {
          240                         if(y <= clist[oldc].y)
          241                                 continue;
          242                         l = search(c, k, y);
          243                         if(l!=oldl+1)
          244                                 oldc = c[l-1];
          245                         if(l<=k) {
          246                                 if(clist[c[l]].y <= y)
          247                                         continue;
          248                                 tc = c[l];
          249                                 c[l] = newcand(i,y,oldc);
          250                                 oldc = tc;
          251                                 oldl = l;
          252                         } else {
          253                                 c[l] = newcand(i,y,oldc);
          254                                 k++;
          255                                 break;
          256                         }
          257                 } while((y=b[++j]) > 0);
          258         }
          259         return k;
          260 }
          261 
          262 static void
          263 unravel(int p)
          264 {
          265         int i;
          266         struct cand *q;
          267 
          268         for(i=0; i<=len[0]; i++) {
          269                 if (i <= pref)
          270                         J[i] = i;
          271                 else if (i > len[0]-suff)
          272                         J[i] = i+len[1]-len[0];
          273                 else
          274                         J[i] = 0;
          275         }
          276         for(q=clist+p;q->y!=0;q=clist+q->pred)
          277                 J[q->x+pref] = q->y+pref;
          278 }
          279 
          280 static void
          281 output(void)
          282 {
          283         int m, i0, i1, j0, j1;
          284 
          285         m = len[0];
          286         J[0] = 0;
          287         J[m+1] = len[1]+1;
          288         if (mode != 'e') {
          289                 for (i0 = 1; i0 <= m; i0 = i1+1) {
          290                         while (i0 <= m && J[i0] == J[i0-1]+1)
          291                                 i0++;
          292                         j0 = J[i0-1]+1;
          293                         i1 = i0-1;
          294                         while (i1 < m && J[i1+1] == 0)
          295                                 i1++;
          296                         j1 = J[i1+1]-1;
          297                         J[i1] = j1;
          298                         change(i0, i1, j0, j1);
          299                 }
          300         }
          301         else {
          302                 for (i0 = m; i0 >= 1; i0 = i1-1) {
          303                         while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
          304                                 i0--;
          305                         j0 = J[i0+1]-1;
          306                         i1 = i0+1;
          307                         while (i1 > 1 && J[i1-1] == 0)
          308                                 i1--;
          309                         j1 = J[i1-1]+1;
          310                         J[i1] = j1;
          311                         change(i1 , i0, j1, j0);
          312                 }
          313         }
          314         if (m == 0)
          315                 change(1, 0, 1, len[1]);
          316         flushchanges();
          317 }
          318 
          319 #define BUF 4096
          320 static int
          321 cmp(Biobuf* b1, Biobuf* b2)
          322 {
          323         int n;
          324         uchar buf1[BUF], buf2[BUF];
          325         int f1, f2;
          326         vlong nc = 1;
          327         uchar *b1s, *b1e, *b2s, *b2e;
          328 
          329         f1 = Bfildes(b1);
          330         f2 = Bfildes(b2);
          331         seek(f1, 0, 0);
          332         seek(f2, 0, 0);
          333         b1s = b1e = buf1;
          334         b2s = b2e = buf2;
          335         for(;;){
          336                 if(b1s >= b1e){
          337                         if(b1s >= &buf1[BUF])
          338                                 b1s = buf1;
          339                         n = read(f1, b1s,  &buf1[BUF] - b1s);
          340                         b1e = b1s + n;
          341                 }
          342                 if(b2s >= b2e){
          343                         if(b2s >= &buf2[BUF])
          344                                 b2s = buf2;
          345                         n = read(f2, b2s,  &buf2[BUF] - b2s);
          346                         b2e = b2s + n;
          347                 }
          348                 n = b2e - b2s;
          349                 if(n > b1e - b1s)
          350                         n = b1e - b1s;
          351                 if(n <= 0)
          352                         break;
          353                 if(memcmp((void *)b1s, (void *)b2s, n) != 0){
          354                         return 1;
          355                 }                
          356                 nc += n;
          357                 b1s += n;
          358                 b2s += n;
          359         }
          360         if(b1e - b1s == b2e - b2s)
          361                 return 0;
          362         return 1;        
          363 }
          364 
          365 void
          366 diffreg(char *f, char *t)
          367 {
          368         Biobuf *b0, *b1;
          369         int k;
          370 
          371         binary = 0;
          372         b0 = prepare(0, f);
          373         if (!b0)
          374                 return;
          375         b1 = prepare(1, t);
          376         if (!b1) {
          377                 FREE(file[0]);
          378                 Bterm(b0);
          379                 return;
          380         }
          381         if (binary){
          382                 /* could use b0 and b1 but this is simpler. */
          383                 if (cmp(b0, b1))
          384                         print("binary files %s %s differ\n", f, t);
          385                 Bterm(b0);
          386                 Bterm(b1);
          387                 return;
          388         }
          389         clen = 0;
          390         prune();
          391         sort(sfile[0], slen[0]);
          392         sort(sfile[1], slen[1]);
          393 
          394         member = (int *)file[1];
          395         equiv(sfile[0], slen[0], sfile[1], slen[1], member);
          396         member = REALLOC(member, int, slen[1]+2);
          397 
          398         class = (int *)file[0];
          399         unsort(sfile[0], slen[0], class);
          400         class = REALLOC(class, int, slen[0]+2);
          401 
          402         klist = MALLOC(int, slen[0]+2);
          403         clist = MALLOC(struct cand, 1);
          404         k = stone(class, slen[0], member, klist);
          405         FREE(member);
          406         FREE(class);
          407 
          408         J = MALLOC(int, len[0]+2);
          409         unravel(klist[k]);
          410         FREE(clist);
          411         FREE(klist);
          412 
          413         ixold = MALLOC(long, len[0]+2);
          414         ixnew = MALLOC(long, len[1]+2);
          415         Bseek(b0, 0, 0); Bseek(b1, 0, 0);
          416         check(b0, b1);
          417         output();
          418         FREE(J); FREE(ixold); FREE(ixnew);
          419         Bterm(b0); Bterm(b1);                        /* ++++ */
          420 }