sort.c

Go to the documentation of this file.
00001 /* sort - sort a file of lines          Author: Michiel Huisjes */
00002 
00003 /* SYNOPSIS:
00004  *      sort [-funbirdcmt'x'] [+beg_pos[opts] [-end_pos]] [-o outfile] [file]..
00005  *
00006  *      [opts] can be any of
00007  *      -f : Fold upper case to lower.
00008  *      -n : Sort to numeric value (optional decimal point) implies -b
00009  *      -b : Skip leading blanks
00010  *      -i : Ignore chars outside ASCII range (040 - 0176)
00011  *      -r : Reverse the sense of comparisons.
00012  *      -d : Sort to dictionary order. Only letters, digits, comma's and points
00013  *           are compared.
00014  *      If any of these flags are used in [opts], then they override all global
00015  *      ordering for this field.
00016  *
00017  *      I/O control flags are:
00018  *      -u : Print uniq lines only once.
00019  *      -c : Check if files are sorted in order.
00020  *      -m : Merge already sorted files.
00021  *      -o outfile : Name of output file. (Can be one of the input files).
00022  *                   Default is stdout.
00023  *      - : Take stdin as input.
00024  *
00025  *      Fields:
00026  *      -t'x' : Field separating character is 'x'
00027  *      +a.b : Start comparing at field 'a' with offset 'b'. A missing 'b' is
00028  *             taken to be 0.
00029  *      -a.b : Stop comparing at field 'a' with offset 'b'. A missing 'b' is
00030  *             taken to be 0.
00031  *      A missing -a.b means the rest of the line.
00032  */
00033 
00034 #include <sys/types.h>
00035 #include <sys/stat.h>
00036 #include <fcntl.h>
00037 #include <signal.h>
00038 #include <unistd.h>
00039 #include <stdlib.h>
00040 #include <string.h>
00041 #include <stdio.h>
00042 #include <limits.h>
00043 
00044 #define OPEN_FILES      (OPEN_MAX-4)    /* Nr of open files per process */
00045 #if __minix_vmd
00046 #define MEMORY_SIZE     (1024 * 1024)
00047 #else
00048 #define MEMORY_SIZE     ((10 * sizeof(int)) * 1024)
00049 #endif
00050                                         /* Total mem_size */
00051 #define LINE_SIZE       (1024 >> 1)     /* Max length of a line */
00052 #define IO_SIZE         (2 * 1024)      /* Size of buffered output */
00053 #define STD_OUT          1      /* Fd of terminal */
00054 
00055 /* Return status of functions */
00056 #define OK               0
00057 #define ERROR           -1
00058 #define NIL_PTR         ((char *) 0)
00059 
00060 /* Compare return values */
00061 #define LOWER           -1
00062 #define SAME             0
00063 #define HIGHER           1
00064 
00065 /* Table definitions. */
00066 #define DICT            0x001   /* Alpha, numeric, letters and . */
00067 #define ASCII           0x002   /* All between ' ' and '~' */
00068 #define BLANK           0x004   /* ' ' and '\t' */
00069 #define DIGIT           0x008   /* 0-9 */
00070 #define UPPER           0x010   /* A-Z */
00071 
00072 typedef int BOOL;
00073 
00074 #define FALSE   0
00075 #define TRUE    1
00076 
00077 typedef struct {
00078   int fd;                       /* Fd of file */
00079   char *buffer;                 /* Buffer for reads */
00080   int read_chars;               /* Nr of chars actually read in buffer */
00081   int cnt;                      /* Nr of chars taken out of buffer */
00082   char *line;                   /* Contains line currently used */
00083 } MERGE;
00084 
00085 #define NIL_MERGE       ((MERGE *) 0)
00086 MERGE merge_f[OPEN_FILES];      /* Merge structs */
00087 int buf_size;                   /* Size of core available for each struct */
00088 
00089 #define FIELDS_LIMIT    10      /* 1 global + 9 user */
00090 #define GLOBAL           0
00091 
00092 typedef struct {
00093   int beg_field, beg_pos;       /* Begin field + offset */
00094   int end_field, end_pos;       /* End field + offset. ERROR == EOLN */
00095   BOOL reverse;                 /* TRUE if rev. flag set on this field */
00096   BOOL blanks;
00097   BOOL dictionary;
00098   BOOL fold_case;
00099   BOOL ascii;
00100   BOOL numeric;
00101 } FIELD;
00102 
00103 /* Field declarations. A total of FILEDS_LIMIT is allowed */
00104 FIELD fields[FIELDS_LIMIT];
00105 int field_cnt;                  /* Nr of field actually assigned */
00106 
00107 /* Various output control flags */
00108 BOOL check = FALSE;
00109 BOOL only_merge = FALSE;
00110 BOOL uniq = FALSE;
00111 
00112 char *mem_top;                  /* Mem_top points to lowest pos of memory. */
00113 char *cur_pos;                  /* First free position in mem */
00114 char **line_table;              /* Pointer to the internal line table */
00115 BOOL in_core = TRUE;            /* Set if input cannot all be sorted in core */
00116 
00117  /* Place where temp_files should be made */
00118 char temp_files[] = "/tmp/sort.XXXXX.XX";
00119 char *output_file;              /* Name of output file */
00120 int out_fd;                     /* Fd to output file (could be STD_OUT) */
00121 char out_buffer[IO_SIZE];       /* For buffered output */
00122 
00123 char **argptr;                  /* Pointer to argv structure */
00124 int args_offset;                /* Nr of args spilled on options */
00125 int args_limit;                 /* Nr of args given */
00126 
00127 char separator;                 /* Char that separates fields */
00128 int nr_of_files = 0;            /* Nr_of_files to be merged */
00129 int disabled;                   /* Nr of files done */
00130 
00131 char USAGE[] = "Usage: sort [-funbirdcmt'x'] [+beg_pos [-end_pos]] [-o outfile] [file] ..";
00132 
00133 /* Forward declarations */
00134 _PROTOTYPE(int main, (int argc, char **argv));
00135 _PROTOTYPE(void get_opts, (char *ptr, FIELD * field));
00136 _PROTOTYPE(void new_field, (FIELD * field, int *offset, BOOL beg_fl));
00137 _PROTOTYPE(void adjust_options, (FIELD * field));
00138 _PROTOTYPE(void error, (BOOL quit, char *message, char *arg));
00139 _PROTOTYPE(void open_outfile, (void));
00140 _PROTOTYPE(void get_file, (int fd, off_t size));
00141 _PROTOTYPE(int last_line, (void));
00142 _PROTOTYPE(void print_table, (int fd));
00143 _PROTOTYPE(char *file_name, (int nr));
00144 _PROTOTYPE(void mread, (int fd, char *address, int bytes));
00145 _PROTOTYPE(void mwrite, (int fd, char *address, int bytes));
00146 _PROTOTYPE(void sort, (void));
00147 _PROTOTYPE(void sort_table, (int nel));
00148 _PROTOTYPE(void incr, (int si, int ei));
00149 _PROTOTYPE(int cmp_fields, (char *el1, char *el2));
00150 _PROTOTYPE(void build_field, (char *dest, FIELD * field, char *src));
00151 _PROTOTYPE(char *skip_fields, (char *str, int nf));
00152 _PROTOTYPE(int compare, (char *el1, char *el2));
00153 _PROTOTYPE(int cmp, (unsigned char *el1, unsigned char *el2, FIELD * field));
00154 _PROTOTYPE(int digits, (char *str1, char *str2, BOOL check_sign));
00155 _PROTOTYPE(void files_merge, (int file_cnt));
00156 _PROTOTYPE(void merge, (int start_file, int limit_file));
00157 _PROTOTYPE(void put_line, (char *line));
00158 _PROTOTYPE(MERGE * print, (MERGE * merg, int file_cnt));
00159 _PROTOTYPE(int read_line, (MERGE * merg));
00160 _PROTOTYPE(MERGE * skip_lines, (MERGE * smallest, int file_cnt));
00161 _PROTOTYPE(void uniq_lines, (MERGE * merg));
00162 _PROTOTYPE(void check_file, (int fd, char *file));
00163 _PROTOTYPE(int length, (char *line));
00164 _PROTOTYPE(void copy, (char *dest, char *src));
00165 _PROTOTYPE(char *msbrk, (int size));
00166 _PROTOTYPE(void mbrk, (char *address));
00167 _PROTOTYPE(void catch, (int dummy));
00168 
00169 /* Table of all chars. 0 means no special meaning. */
00170 char table[256] = {
00171 /* '^@' to space */
00172            0, 0, 0, 0, 0, 0, 0, 0,
00173            0, BLANK | DICT, 0, 0, 0, 0, 0, 0,
00174            0, 0, 0, 0, 0, 0, 0, 0,
00175            0, 0, 0, 0, 0, 0, 0, 0,
00176 
00177 /* Space to '0' */
00178        BLANK | DICT | ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
00179            ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
00180            ASCII, ASCII,
00181 
00182 /* '0' until '9' */
00183      DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
00184      DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
00185      DIGIT | DICT | ASCII, DIGIT | DICT | ASCII, DIGIT | DICT | ASCII,
00186            DIGIT | DICT | ASCII,
00187 
00188 /* ASCII from ':' to '@' */
00189            ASCII, ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
00190 
00191 /* Upper case letters 'A' to 'Z' */
00192      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00193      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00194      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00195      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00196      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00197      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00198      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00199      UPPER | DICT | ASCII, UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00200            UPPER | DICT | ASCII, UPPER | DICT | ASCII,
00201 
00202 /* ASCII from '[' to '`' */
00203            ASCII, ASCII, ASCII, ASCII, ASCII, ASCII,
00204 
00205 /* Lower case letters from 'a' to 'z' */
00206            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00207            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00208            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00209            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00210            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00211            DICT | ASCII, DICT | ASCII, DICT | ASCII, DICT | ASCII,
00212            DICT | ASCII, DICT | ASCII,
00213 
00214 /* ASCII from '{' to '~' */
00215            ASCII, ASCII, ASCII, ASCII,
00216 
00217 /* Stuff from -1 to -177 */
00218            0, 0, 0, 0, 0, 0, 0, 0, 0,
00219            0, 0, 0, 0, 0, 0, 0, 0, 0,
00220            0, 0, 0, 0, 0, 0, 0, 0, 0,
00221            0, 0, 0, 0, 0, 0, 0, 0, 0,
00222            0, 0, 0, 0, 0, 0, 0, 0, 0,
00223            0, 0, 0, 0, 0, 0, 0, 0, 0,
00224            0, 0, 0, 0, 0, 0, 0, 0, 0,
00225            0, 0, 0, 0, 0, 0, 0, 0, 0,
00226            0, 0, 0, 0, 0, 0, 0
00227 };
00228 
00229 
00230 /*
00231  * Get_opts () assigns the options into the field structure as described in ptr.
00232  * This field structure could be the GLOBAL one.
00233  */
00234 void get_opts(ptr, field)
00235 register char *ptr;
00236 register FIELD *field;
00237 {
00238   switch (*ptr) {
00239       case 'b':                 /* Skip leading blanks */
00240         field->blanks = TRUE;
00241         break;
00242       case 'd':                 /* Dictionary order */
00243         field->dictionary = TRUE;
00244         break;
00245       case 'f':                 /* Fold upper case to lower */
00246         field->fold_case = TRUE;
00247         break;
00248       case 'i':                 /* Skip chars outside ' ' '~' */
00249         field->ascii = TRUE;
00250         break;
00251       case 'n':                 /* Sort on numeric */
00252         field->numeric = TRUE;
00253         field->blanks = TRUE;
00254         break;
00255       case 'r':                 /* Reverse comparisons */
00256         field->reverse = TRUE;
00257         break;
00258       default:                  /* Illegal options */
00259         error(TRUE, USAGE, NIL_PTR);
00260   }
00261 }
00262 
00263 /* New_field () assigns a new field as described by the arguments.
00264  * A field description is of the form: +a.b[opts] -c.d, where b and d, as well
00265  * as -c.d and [opts] are optional. Nr before digit is field nr. Nr after digit
00266  * is offset from field.
00267  */
00268 void new_field(field, offset, beg_fl)
00269 register FIELD *field;          /* Field to assign */
00270 int *offset;                    /* Offset in argv structure */
00271 BOOL beg_fl;                    /* Assign beg or end of field */
00272 {
00273   register char *ptr;
00274 
00275   ptr = argptr[*offset];
00276   *offset += 1;                 /* Incr offset to next arg */
00277   ptr++;
00278 
00279   if (beg_fl)
00280         field->beg_field = atoi(ptr);   /* Assign int of first field */
00281   else
00282         field->end_field = atoi(ptr);
00283 
00284   while (table[*ptr] & DIGIT)   /* Skip all digits */
00285         ptr++;
00286 
00287   if (*ptr == '.') {            /* Check for offset */
00288         ptr++;
00289         if (beg_fl)
00290                 field->beg_pos = atoi(ptr);
00291         else
00292                 field->end_pos = atoi(ptr);
00293         while (table[*ptr] & DIGIT)     /* Skip digits */
00294                 ptr++;
00295   }
00296   if (beg_fl) {
00297         while (*ptr != '\0')    /* Check options after field */
00298                 get_opts(ptr++, field);
00299   }
00300   if (beg_fl) {                 /* Check for end pos */
00301         ptr = argptr[*offset];
00302         if (ptr && *ptr == '-' && ((table[*(ptr + 1)] & DIGIT) || *(ptr + 1) == '.')) {
00303                 new_field(field, offset, FALSE);
00304                 if (field->beg_field > field->end_field)
00305                         error(TRUE, "End field is before start field!", NIL_PTR);
00306         } else                  /* No end pos. */
00307                 field->end_field = ERROR;
00308   }
00309 }
00310 
00311 int main(argc, argv)
00312 int argc;
00313 char *argv[];
00314 {
00315   int arg_count = 1;            /* Offset in argv */
00316   struct stat st;
00317   register char *ptr;           /* Ptr to *argv in use */
00318   register int fd;
00319   int pid, pow;
00320 
00321   argptr = argv;
00322   cur_pos = mem_top = msbrk(MEMORY_SIZE);       /* Find lowest mem. location */
00323 
00324   while (arg_count < argc && ((ptr = argv[arg_count])[0] == '-' || *ptr == '+')) {
00325         if (*ptr == '-' && *(ptr + 1) == '\0')  /* "-" means stdin */
00326                 break;
00327         if (*ptr == '+') {      /* Assign field. */
00328                 if (++field_cnt == FIELDS_LIMIT)
00329                         error(TRUE, "Too many fields", NIL_PTR);
00330                 new_field(&fields[field_cnt], &arg_count, TRUE);
00331         } else {                /* Get output options */
00332                 while (*++ptr) {
00333                         switch (*ptr) {
00334                             case 'c':   /* Only check file */
00335                                 check = TRUE;
00336                                 break;
00337                             case 'm':   /* Merge (sorted) files */
00338                                 only_merge = TRUE;
00339                                 break;
00340                             case 'u':   /* Only give uniq lines */
00341                                 uniq = TRUE;
00342                                 break;
00343                             case 'o':   /* Name of output file */
00344                                 output_file = argv[++arg_count];
00345                                 break;
00346                             case 't':   /* Field separator */
00347                                 ptr++;
00348                                 separator = *ptr;
00349                                 break;
00350                             default:    /* Sort options */
00351                                 get_opts(ptr, &fields[GLOBAL]);
00352                         }
00353                 }
00354                 arg_count++;
00355         }
00356   }
00357 
00358   for (fd = 1; fd <= field_cnt; fd++) adjust_options(&fields[fd]);
00359 
00360 /* Create name of tem_files 'sort.pid.aa' */
00361   ptr = &temp_files[10];
00362   pid = getpid();
00363   pow = 10000;
00364   while (pow != 0) {
00365         *ptr++ = pid / pow + '0';
00366         pid %= pow;
00367         pow /= 10;
00368   }
00369 
00370   signal(SIGINT, catch);
00371 
00372 /* Only merge files. Set up */
00373   if (only_merge) {
00374         args_limit = args_offset = arg_count;
00375         while (argv[args_limit] != NIL_PTR)
00376                 args_limit++;   /* Find nr of args */
00377         files_merge(args_limit - arg_count);
00378         exit(0);
00379   }
00380   if (arg_count == argc) {      /* No args left. Use stdin */
00381         if (check)
00382                 check_file(0, NIL_PTR);
00383         else
00384                 get_file(0, (off_t) 0);
00385   } else
00386         while (arg_count < argc) {      /* Sort or check args */
00387                 if (strcmp(argv[arg_count], "-") == 0)
00388                         fd = 0;
00389                 else if (stat(argv[arg_count], &st) < 0) {
00390                         error(FALSE, "Cannot find ", argv[arg_count++]);
00391                         continue;
00392                 }
00393 
00394                 /* Open files */
00395                 else if ((fd = open(argv[arg_count], O_RDONLY)) < 0) {
00396                         error(FALSE, "Cannot open ", argv[arg_count++]);
00397                         continue;
00398                 }
00399                 if (check)
00400                         check_file(fd, argv[arg_count]);
00401                 else            /* Get_file reads whole file */
00402                         get_file(fd, st.st_size);
00403                 arg_count++;
00404         }
00405 
00406   if (check) exit(0);
00407 
00408   sort();                       /* Sort whatever is left */
00409 
00410   if (nr_of_files == 1)         /* Only one file sorted -> don't merge */
00411         exit(0);
00412 
00413   files_merge(nr_of_files);
00414   return(0);
00415 }
00416 
00417 /* Adjust_options() assigns all global variables set also in the fields
00418  * assigned.
00419  */
00420 void adjust_options(field)
00421 register FIELD *field;
00422 {
00423   register FIELD *gfield = &fields[GLOBAL];
00424 
00425   if (gfield->reverse) field->reverse = TRUE;
00426   if (gfield->blanks) field->blanks = TRUE;
00427   if (gfield->dictionary) field->dictionary = TRUE;
00428   if (gfield->fold_case) field->fold_case = TRUE;
00429   if (gfield->ascii) field->ascii = TRUE;
00430   if (gfield->numeric) field->numeric = TRUE;
00431 }
00432 
00433 /* Error () prints the error message on stderr and exits if quit == TRUE. */
00434 void error(quit, message, arg)
00435 register BOOL quit;
00436 register char *message, *arg;
00437 {
00438   write(2, message, strlen(message));
00439   if (arg != NIL_PTR) write(2, arg, strlen(arg));
00440   perror(" ");
00441   if (quit) exit(1);
00442 }
00443 
00444 /* Open_outfile () assigns to out_fd the fd where the output must go when all
00445  * the sorting is done.
00446  */
00447 void open_outfile()
00448 {
00449   if (output_file == NIL_PTR)
00450         out_fd = STD_OUT;
00451   else if ((out_fd = creat(output_file, 0644)) < 0)
00452         error(TRUE, "Cannot creat ", output_file);
00453 }
00454 
00455 /* Get_file reads the whole file of filedescriptor fd. If the file is too big
00456  * to keep in core, a partial sort is done, and the output is stashed somewhere.
00457  */
00458 void get_file(fd, size)
00459 int fd;                         /* Fd of file to read */
00460 register off_t size;            /* Size of file */
00461 {
00462   register int i;
00463   int rest;                     /* Rest in memory */
00464   char save_ch;                 /* Used in stdin readings */
00465 
00466   rest = MEMORY_SIZE - (cur_pos - mem_top);
00467   if (fd == 0) {                /* We're reding stdin */
00468         while ((i = read(0, cur_pos, rest)) > 0) {
00469                 if ((cur_pos - mem_top) + i == MEMORY_SIZE) {
00470                         in_core = FALSE;
00471                         i = last_line();        /* End of last line */
00472                         save_ch = mem_top[i];
00473                         mem_top[i] = '\0';
00474                         sort(); /* Sort core */
00475                         mem_top[i] = save_ch;   /* Restore erased char */
00476                         /* Restore last (half read) line */
00477                         for (rest = 0; i + rest != MEMORY_SIZE; rest++)
00478                                 mem_top[rest] = mem_top[i + rest];
00479                         /* Assign current pos. in memory */
00480                         cur_pos = &mem_top[rest];
00481                 } else {        /* Fits, just assign position in mem. */
00482                         cur_pos = cur_pos + i;
00483                         *cur_pos = '\0';
00484                 }
00485 
00486                 /* Calculate rest of mem */
00487                 rest = MEMORY_SIZE - (cur_pos - mem_top);
00488         }
00489   }
00490 
00491   /* Reading file. Check size */
00492   else if (size > rest) {       /* Won't fit */
00493         mread(fd, cur_pos, rest);
00494         in_core = FALSE;
00495         i = last_line();        /* Get pos. of last line */
00496         mem_top[i] = '\0';      /* Truncate */
00497         (void) lseek(fd, (off_t) (i - MEMORY_SIZE), SEEK_CUR);  /* Do this next time */
00498         size = size - rest - i + MEMORY_SIZE;   /* Calculate rest */
00499         cur_pos = mem_top;      /* Reset mem */
00500         sort();                 /* Sort core */
00501         get_file(fd, size);     /* Get rest of file */
00502   } else {                      /* Fits. Just read in */
00503         rest = size;
00504         mread(fd, cur_pos, rest);
00505         cur_pos = cur_pos + rest;       /* Reassign cur_pos */
00506         *cur_pos = '\0';
00507         (void) close(fd);       /* File completed */
00508   }
00509 }
00510 
00511 /* Last_line () find the last line in core and retuns the offset from the top
00512  * of the memory.
00513  */
00514 int last_line()
00515 {
00516   register int i;
00517 
00518   for (i = MEMORY_SIZE - 2; i > 0; i--)
00519         if (mem_top[i] == '\n') break;
00520   return i + 1;
00521 }
00522 
00523 /* Print_table prints the line table in the given file_descriptor. If the fd
00524  * equals ERROR, it opens a temp_file itself.
00525  */
00526 void print_table(fd)
00527 int fd;
00528 {
00529   register char **line_ptr;     /* Ptr in line_table */
00530   register char *ptr;           /* Ptr to line */
00531   int index = 0;                /* Index in output buffer */
00532 
00533   if (fd == ERROR) {
00534         if ((fd = creat(file_name(nr_of_files), 0644)) < 0)
00535                 error(TRUE, "Cannot creat ", file_name(nr_of_files));
00536   }
00537   for (line_ptr = line_table; *line_ptr != NIL_PTR; line_ptr++) {
00538         ptr = *line_ptr;
00539         /* Skip all same lines if uniq is set */
00540         if (uniq && *(line_ptr + 1) != NIL_PTR) {
00541                 if (compare(ptr, *(line_ptr + 1)) == SAME) continue;
00542         }
00543         do {                    /* Print line in a buffered way */
00544                 out_buffer[index++] = *ptr;
00545                 if (index == IO_SIZE) {
00546                         mwrite(fd, out_buffer, IO_SIZE);
00547                         index = 0;
00548                 }
00549         } while (*ptr++ != '\n');
00550   }
00551   mwrite(fd, out_buffer, index);/* Flush buffer */
00552   (void) close(fd);             /* Close file */
00553   nr_of_files++;                /* Increment nr_of_files to merge */
00554 }
00555 
00556 /* File_name () returns the nr argument from the argument list, or a uniq
00557  * filename if the nr is too high, or the arguments were not merge files.
00558  */
00559 char *file_name(nr)
00560 register int nr;
00561 {
00562   if (only_merge) {
00563         if (args_offset + nr < args_limit) return argptr[args_offset + nr];
00564   }
00565   temp_files[16] = nr / 26 + 'a';
00566   temp_files[17] = nr % 26 + 'a';
00567 
00568   return temp_files;
00569 }
00570 
00571 /* Mread () performs a normal read (), but checks the return value. */
00572 void mread(fd, address, bytes)
00573 int fd;
00574 char *address;
00575 register int bytes;
00576 {
00577   if (read(fd, address, bytes) < 0 && bytes != 0)
00578         error(TRUE, "Read error", NIL_PTR);
00579 }
00580 
00581 /* Mwrite () performs a normal write (), but checks the return value. */
00582 void mwrite(fd, address, bytes)
00583 int fd;
00584 char *address;
00585 register int bytes;
00586 {
00587   if (write(fd, address, bytes) != bytes && bytes != 0)
00588         error(TRUE, "Write error", NIL_PTR);
00589 }
00590 
00591 /* Sort () sorts the input in memory starting at mem_top. */
00592 void sort()
00593 {
00594   register char *ptr = mem_top;
00595   register int count = 0;
00596 
00597 /* Count number of lines in memory */
00598   while (*ptr) {
00599         if (*ptr++ == '\n') count++;
00600   }
00601 
00602 /* Set up the line table */
00603   line_table = (char **) msbrk(count * sizeof(char *) + sizeof(char *));
00604 
00605   count = 1;
00606   ptr = line_table[0] = mem_top;
00607   while (*ptr) {
00608         if (*ptr++ == '\n') line_table[count++] = ptr;
00609   }
00610 
00611   line_table[count - 1] = NIL_PTR;
00612 
00613 /* Sort the line table */
00614   sort_table(count - 1);
00615 
00616 /* Stash output somewhere */
00617   if (in_core) {
00618         open_outfile();
00619         print_table(out_fd);
00620   } else
00621         print_table(ERROR);
00622 
00623 /* Free line table */
00624   mbrk((char *) line_table);
00625 }
00626 
00627 /* Sort_table () sorts the line table consisting of nel elements. */
00628 void sort_table(nel)
00629 register int nel;
00630 {
00631   char *tmp;
00632   register int i;
00633 
00634   /* Make heap */
00635   for (i = (nel >> 1); i >= 1; i--) incr(i, nel);
00636 
00637   /* Sort from heap */
00638   for (i = nel; i > 1; i--) {
00639         tmp = line_table[0];
00640         line_table[0] = line_table[i - 1];
00641         line_table[i - 1] = tmp;
00642         incr(1, i - 1);
00643   }
00644 }
00645 
00646 /* Incr () increments the heap. */
00647 void incr(si, ei)
00648 register int si, ei;
00649 {
00650   char *tmp;
00651 
00652   while (si <= (ei >> 1)) {
00653         si <<= 1;
00654         if (si + 1 <= ei && compare(line_table[si - 1], line_table[si]) <= 0)
00655                 si++;
00656         if (compare(line_table[(si >> 1) - 1], line_table[si - 1]) >= 0)
00657                 return;
00658         tmp = line_table[(si >> 1) - 1];
00659         line_table[(si >> 1) - 1] = line_table[si - 1];
00660         line_table[si - 1] = tmp;
00661   }
00662 }
00663 
00664 /* Cmp_fields builds new lines out of the lines pointed to by el1 and el2 and
00665  * puts it into the line1 and line2 arrays. It then calls the cmp () routine
00666  * with the field describing the arguments.
00667  */
00668 int cmp_fields(el1, el2)
00669 register char *el1, *el2;
00670 {
00671   int i, ret;
00672   char line1[LINE_SIZE], line2[LINE_SIZE];
00673 
00674   for (i = 0; i < field_cnt; i++) {     /* Setup line parts */
00675         build_field(line1, &fields[i + 1], el1);
00676         build_field(line2, &fields[i + 1], el2);
00677         if ((ret = cmp((unsigned char *) line1, (unsigned char *) line2,
00678                        &fields[i + 1])) != SAME)
00679                 break;          /* If equal, try next field */
00680   }
00681 
00682 /* Check for reverse flag */
00683   if (i != field_cnt && fields[i + 1].reverse) return -ret;
00684 
00685 /* Else return the last return value of cmp () */
00686   return ret;
00687 }
00688 
00689 /* Build_field builds a new line from the src as described by the field.
00690  * The result is put in dest.
00691  */
00692 void build_field(dest, field, src)
00693 char *dest;                     /* Holds result */
00694 register FIELD *field;          /* Field description */
00695 register char *src;             /* Source line */
00696 {
00697   char *begin = src;            /* Remember start location */
00698   char *last;                   /* Pointer to end location */
00699   int i;
00700 
00701 /* Skip begin fields */
00702   src = skip_fields(src, field->beg_field);
00703 
00704 /* Skip begin positions */
00705   for (i = 0; i < field->beg_pos && *src != '\n'; i++) src++;
00706 
00707 /* Copy whatever is left */
00708   copy(dest, src);
00709 
00710 /* If end field is assigned truncate (perhaps) the part copied */
00711   if (field->end_field != ERROR) {      /* Find last field */
00712         last = skip_fields(begin, field->end_field);
00713 /* Skip positions as given by end fields description */
00714         for (i = 0; i < field->end_pos && *last != '\n'; i++) last++;
00715         dest[last - src] = '\n';/* Truncate line */
00716   }
00717 }
00718 
00719 /* Skip_fields () skips nf fields of the line pointed to by str. */
00720 char *skip_fields(str, nf)
00721 register char *str;
00722 int nf;
00723 {
00724   while (nf-- > 0) {
00725         if (separator == '\0') {/* Means ' ' or '\t' */
00726                 while (*str != ' ' && *str != '\t' && *str != '\n') str++;
00727                 while (table[*str] & BLANK) str++;
00728         } else {
00729                 while (*str != separator && *str != '\n') str++;
00730                 if (*str == separator) str++;
00731         }
00732   }
00733   return str;                   /* Return pointer to indicated field */
00734 }
00735 
00736 /* Compare is called by all sorting routines. It checks if fields assignments
00737  * has been made. if so, it calls cmp_fields (). If not, it calls cmp () and
00738  * reversed the return value if the (global) reverse flag is set.
00739  */
00740 int compare(el1, el2)
00741 register char *el1, *el2;
00742 {
00743   int ret;
00744 
00745   if (field_cnt > GLOBAL) return cmp_fields(el1, el2);
00746 
00747   ret = cmp((unsigned char *) el1, (unsigned char *) el2, &fields[GLOBAL]);
00748   return(fields[GLOBAL].reverse) ? -ret : ret;
00749 }
00750 
00751 /* Cmp () is the actual compare routine. It compares according to the
00752  * description given in the field pointer.
00753  */
00754 int cmp(el1, el2, field)
00755 register unsigned char *el1, *el2;
00756 FIELD *field;
00757 {
00758   int c1, c2;
00759 
00760   if (field->blanks) {          /* Skip leading blanks */
00761         while (table[*el1] & BLANK) el1++;
00762         while (table[*el2] & BLANK) el2++;
00763   }
00764   if (field->numeric)           /* Compare numeric */
00765         return digits((char *) el1, (char *) el2, TRUE);
00766 
00767   for (;;) {
00768         while (*el1 == *el2) {
00769                 if (*el1++ == '\n')     /* EOLN on both strings */
00770                         return SAME;
00771                 el2++;
00772         }
00773         if (*el1 == '\n')       /* EOLN on string one */
00774                 return LOWER;
00775         if (*el2 == '\n') return HIGHER;
00776         if (field->ascii) {     /* Skip chars outside 040 - 0177 */
00777                 if ((table[*el1] & ASCII) == 0) {
00778                         do {
00779                                 el1++;
00780                         } while ((table[*el1] & ASCII) == 0);
00781                         continue;
00782                 }
00783                 if ((table[*el2] & ASCII) == 0) {
00784                         do {
00785                                 el2++;
00786                         } while ((table[*el2] & ASCII) == 0);
00787                         continue;
00788                 }
00789         }
00790         if (field->dictionary) {/* Skip non-dict chars */
00791                 if ((table[*el1] & DICT) == 0) {
00792                         do {
00793                                 el1++;
00794                         } while ((table[*el1] & DICT) == 0);
00795                         continue;
00796                 }
00797                 if ((table[*el2] & DICT) == 0) {
00798                         do {
00799                                 el2++;
00800                         } while ((table[*el2] & DICT) == 0);
00801                         continue;
00802                 }
00803         }
00804         if (field->fold_case) { /* Fold upper case to lower */
00805                 if (table[c1 = *el1++] & UPPER) c1 += 'a' - 'A';
00806                 if (table[c2 = *el2++] & UPPER) c2 += 'a' - 'A';
00807                 if (c1 == c2) continue;
00808                 return c1 - c2;
00809         }
00810         return *el1 - *el2;
00811   }
00812 
00813   /* NOTREACHED */
00814 }
00815 
00816 /*
00817  * Digits compares () the two strings that point to a number of digits followed
00818  * by an optional decimal point.
00819  */
00820 int digits(str1, str2, check_sign)
00821 register char *str1, *str2;
00822 BOOL check_sign;                /* True if sign must be checked */
00823 {
00824   BOOL negative = FALSE;        /* True if negative numbers */
00825   int diff, pow, ret;
00826 
00827 /* Check for optional minus or plus sign */
00828   if (check_sign) {
00829         if (*str1 == '-') {
00830                 negative = TRUE;
00831                 str1++;
00832         } else if (*str1 == '+')
00833                 str1++;
00834 
00835         if (*str2 == '-') {
00836                 if (negative == FALSE) return HIGHER;
00837                 str2++;
00838         } else if (negative)
00839                 return LOWER;
00840         else if (*str2 == '+')
00841                 str2++;
00842   }
00843 
00844 /* Keep incrementing as long as digits are available and equal */
00845   while ((table[*str1] & DIGIT) && table[*str2] & DIGIT) {
00846         if (*str1 != *str2) break;
00847         str1++;
00848         str2++;
00849   }
00850 
00851 /* First check for the decimal point. */
00852   if (*str1 == '.' || *str2 == '.') {
00853         if (*str1 == '.') {
00854                 if (*str2 == '.')       /* Both. Check decimal part */
00855                         ret = digits(str1 + 1, str2 + 1, FALSE);
00856                 else
00857                         ret = (table[*str2] & DIGIT) ? LOWER : HIGHER;
00858         } else
00859                 ret = (table[*str1] & DIGIT) ? HIGHER : LOWER;
00860   }
00861 
00862 /* Now either two digits differ, or unknown char is seen (e.g. end of string) */
00863   else if ((table[*str1] & DIGIT) && (table[*str2] & DIGIT)) {
00864         diff = *str1 - *str2;   /* Basic difference */
00865         pow = 0;                /* Check power of numbers */
00866         while (table[*str1++] & DIGIT) pow++;
00867         while (table[*str2++] & DIGIT) pow--;
00868         ret = (pow == 0) ? diff : pow;
00869   }
00870 
00871 /* Unknown char. Check on which string it occurred */
00872   else {
00873         if ((table[*str1] & DIGIT) == 0)
00874                 ret = (table[*str2] & DIGIT) ? LOWER : SAME;
00875         else
00876                 ret = HIGHER;
00877   }
00878 
00879 /* Reverse sense of comparisons if negative is true. (-1000 < -1) */
00880   return(negative) ? -ret : ret;
00881 }
00882 
00883 /* Files_merge () merges all files as indicated by nr_of_files. Merging goes
00884  * in numbers of files that can be opened at the same time. (OPEN_FILES)
00885  */
00886 void files_merge(file_cnt)
00887 register int file_cnt;          /* Nr_of_files to merge */
00888 {
00889   register int i;
00890   int limit;
00891 
00892   for (i = 0; i < file_cnt; i += OPEN_FILES) {
00893         /* Merge last files and store in output file */
00894         if ((limit = i + OPEN_FILES) >= file_cnt) {
00895                 open_outfile();
00896                 limit = file_cnt;
00897         } else {                /* Merge OPEN_FILES files and store in temp
00898                          * file */
00899                 temp_files[16] = file_cnt / 26 + 'a';
00900                 temp_files[17] = file_cnt % 26 + 'a';
00901                 if ((out_fd = creat(temp_files, 0644)) < 0)
00902                         error(TRUE, "Cannot creat ", temp_files);
00903                 file_cnt++;
00904         }
00905         merge(i, limit);
00906   }
00907 
00908 /* Cleanup mess */
00909   i = (only_merge) ? args_limit - args_offset : 0;
00910   while (i < file_cnt) (void) unlink(file_name(i++));
00911 }
00912 
00913 /* Merge () merges the files between start_file and limit_file. */
00914 void merge(start_file, limit_file)
00915 int start_file, limit_file;
00916 {
00917   register MERGE *smallest;     /* Keeps track of smallest line */
00918   register int i;
00919   int file_cnt = limit_file - start_file;       /* Nr of files to merge */
00920 
00921 /* Calculate size in core available for file_cnt merge structs */
00922   buf_size = MEMORY_SIZE / file_cnt - LINE_SIZE;
00923 
00924   mbrk(mem_top);                /* First reset mem to lowest loc. */
00925   disabled = 0;                 /* All files not done yet */
00926 
00927 /* Set up merge structures. */
00928   for (i = start_file; i < limit_file; i++) {
00929         smallest = &merge_f[i - start_file];
00930         if (!strcmp(file_name(i), "-")) /* File is stdin */
00931                 smallest->fd = 0;
00932         else if ((smallest->fd = open(file_name(i), O_RDONLY)) < 0) {
00933                 smallest->fd = ERROR;
00934                 error(FALSE, "Cannot open ", file_name(i));
00935                 disabled++;     /* Done this file */
00936                 continue;
00937         }
00938         smallest->buffer = msbrk(buf_size);
00939         smallest->line = msbrk(LINE_SIZE);
00940         smallest->cnt = smallest->read_chars = 0;
00941         (void) read_line(smallest);     /* Read first line */
00942   }
00943 
00944   if (disabled == file_cnt) {   /* Couldn't open files */
00945         (void) close(out_fd);
00946         return;
00947   }
00948 
00949 /* Find a merg struct to assign smallest. */
00950   for (i = 0; i < file_cnt; i++) {
00951         if (merge_f[i].fd != ERROR) {
00952                 smallest = &merge_f[i];
00953                 break;
00954         }
00955   }
00956 
00957 /* Loop until all files minus one are done */
00958   while (disabled < file_cnt - 1) {
00959         if (uniq)               /* Skip all same lines */
00960                 smallest = skip_lines(smallest, file_cnt);
00961         else {                  /* Find smallest line */
00962                 for (i = 0; i < file_cnt; i++) {
00963                         if (merge_f[i].fd == ERROR)
00964                                 continue;       /* We've had this one */
00965                         if (compare(merge_f[i].line, smallest->line) < 0)
00966                                 smallest = &merge_f[i];
00967                 }
00968         }                       /* Print line and read next */
00969         smallest = print(smallest, file_cnt);
00970   }
00971 
00972   if (only_merge && uniq)
00973         uniq_lines(smallest);   /* Print only uniq lines */
00974   else                          /* Print rest of file */
00975         while (print(smallest, file_cnt) != NIL_MERGE);
00976 
00977   put_line(NIL_PTR);            /* Flush output buffer */
00978 }
00979 
00980 /* Put_line () prints the line into the out_fd filedescriptor. If line equals
00981  * NIL_PTR, the out_fd is flushed and closed.
00982  */
00983 void put_line(line)
00984 register char *line;
00985 {
00986   static int index = 0;         /* Index in out_buffer */
00987 
00988   if (line == NIL_PTR) {        /* Flush and close */
00989         mwrite(out_fd, out_buffer, index);
00990         index = 0;
00991         (void) close(out_fd);
00992         return;
00993   }
00994   do {                          /* Fill out_buffer with line */
00995         out_buffer[index++] = *line;
00996         if (index == IO_SIZE) {
00997                 mwrite(out_fd, out_buffer, IO_SIZE);
00998                 index = 0;
00999         }
01000   } while (*line++ != '\n');
01001 }
01002 
01003 /*
01004  * Print () prints the line of the merg structure and tries to read another one.
01005  * If this fails, it returns the next merg structure which file_descriptor is
01006  * still open. If none could be found, a NIL structure is returned.
01007  */
01008 MERGE *print(merg, file_cnt)
01009 register MERGE *merg;
01010 int file_cnt;                   /* Nr of files that are being merged */
01011 {
01012   register int i;
01013 
01014   put_line(merg->line);         /* Print the line */
01015 
01016   if (read_line(merg) == ERROR) {       /* Read next line */
01017         for (i = 0; i < file_cnt; i++) {
01018                 if (merge_f[i].fd != ERROR) {
01019                         merg = &merge_f[i];
01020                         break;
01021                 }
01022         }
01023         if (i == file_cnt)      /* No more files left */
01024                 return NIL_MERGE;
01025   }
01026   return merg;
01027 }
01028 
01029 /* Read_line () reads a line from the fd from the merg struct. If the read
01030  * failed, disabled is incremented and the file is closed. Readings are
01031  * done in buf_size bytes.
01032  * Lines longer than LINE_SIZE are silently truncated.
01033  */
01034 int read_line(merg)
01035 register MERGE *merg;
01036 {
01037   register char *ptr = merg->line - 1;  /* Ptr buf that will hold line */
01038 
01039   do {
01040         ptr++;
01041         if (merg->cnt == merg->read_chars) {    /* Read new buffer */
01042                 if ((merg->read_chars =
01043                      read(merg->fd, merg->buffer, buf_size)) <= 0) {
01044                         (void) close(merg->fd); /* OOPS */
01045                         merg->fd = ERROR;
01046                         disabled++;
01047                         return ERROR;
01048                 }
01049                 merg->cnt = 0;
01050         }
01051         *ptr = merg->buffer[merg->cnt++];       /* Assign next char of line */
01052         if (ptr - merg->line == LINE_SIZE - 1)
01053                 *ptr = '\n';    /* Truncate very long lines */
01054   } while (*ptr != '\n' && *ptr != '\0');
01055 
01056   if (*ptr == '\0')             /* Add '\n' to last line */
01057         *ptr = '\n';
01058   *++ptr = '\0';                /* Add '\0' */
01059   return OK;
01060 }
01061 
01062 /* Skip_lines () skips all same lines in all the files currently being merged.
01063  * It returns a pointer to the merge struct containing the smallest line.
01064  */
01065 MERGE *skip_lines(smallest, file_cnt)
01066 register MERGE *smallest;
01067 int file_cnt;
01068 {
01069   register int i;
01070   int ret;
01071 
01072   if (disabled == file_cnt - 1) /* We've had all */
01073         return smallest;
01074 
01075   for (i = 0; i < file_cnt; i++) {
01076         if (merge_f[i].fd == ERROR || smallest == &merge_f[i])
01077                 continue;       /* Don't check same file */
01078         while ((ret = compare(merge_f[i].line, smallest->line)) == 0) {
01079                 if (read_line(&merge_f[i]) == ERROR) break;     /* EOF */
01080         }
01081         if (ret < 0)            /* Line wasn't smallest. Try again */
01082                 return skip_lines(&merge_f[i], file_cnt);
01083   }
01084   return smallest;
01085 }
01086 
01087 /* Uniq_lines () prints only the uniq lines out of the fd of the merg struct. */
01088 void uniq_lines(merg)
01089 register MERGE *merg;
01090 {
01091   char lastline[LINE_SIZE];     /* Buffer to hold last line */
01092 
01093   for (;;) {
01094         put_line(merg->line);   /* Print this line */
01095         copy(lastline, merg->line);     /* and save it */
01096         if (read_line(merg) == ERROR)   /* Read the next */
01097                 return;
01098         /* Keep reading until lines duffer */
01099         while (compare(lastline, merg->line) == SAME)
01100                 if (read_line(merg) == ERROR) return;
01101   }
01102 
01103   /* NOTREACHED */
01104 }
01105 
01106 /*
01107  * Check_file () checks if a file is sorted in order according to the arguments
01108  * given in main ().
01109  */
01110 void check_file(fd, file)
01111 int fd;
01112 char *file;
01113 {
01114   register MERGE *merg;         /* 1 file only */
01115   char lastline[LINE_SIZE];     /* Save last line */
01116   register int ret;             /* ret status of compare */
01117 
01118   if (fd == 0) file = "stdin";
01119   merg = (MERGE *) mem_top;     /* Assign MERGE structure */
01120   merg->buffer = mem_top + sizeof(MERGE);
01121   merg->line = msbrk(LINE_SIZE);
01122   merg->cnt = merg->read_chars = 0;
01123   merg->fd = fd;
01124   buf_size = MEMORY_SIZE - sizeof(MERGE);
01125 
01126   if (read_line(merg) == ERROR) /* Read first line */
01127         return;
01128   copy(lastline, merg->line);   /* and save it */
01129 
01130   for (;;) {
01131         if (read_line(merg) == ERROR)   /* EOF reached */
01132                 break;
01133         if ((ret = compare(lastline, merg->line)) > 0) {
01134                 error(FALSE, "Disorder in file ", file);
01135                 write(2, merg->line, length(merg->line));
01136                 break;
01137         } else if (ret < 0)     /* Copy if lines not equal */
01138                 copy(lastline, merg->line);
01139         else if (uniq) {
01140                 error(FALSE, "Non uniq line in file ", file);
01141                 write(2, merg->line, length(merg->line));
01142                 break;
01143         }
01144   }
01145 
01146   mbrk(mem_top);                /* Reset mem */
01147 }
01148 
01149 /* Length () returns the length of the argument line including the linefeed. */
01150 int length(line)
01151 register char *line;
01152 {
01153   register int i = 1;           /* Add linefeed */
01154 
01155   while (*line++ != '\n') i++;
01156   return i;
01157 }
01158 
01159 /* Copy () copies the src line into the dest line including linefeed. */
01160 void copy(dest, src)
01161 register char *dest, *src;
01162 {
01163   while ((*dest++ = *src++) != '\n');
01164 }
01165 
01166 /* Msbrk() does a sbrk() and checks the return value. */
01167 char *msbrk(size)
01168 register int size;
01169 {
01170   register char *address;
01171 
01172   if ((address = sbrk(size)) == (char *) -1)
01173         error(TRUE, "Not enough memory. Use chmem to allocate more", NIL_PTR);
01174   return address;
01175 }
01176 
01177 /* Mbrk() does a brk() and checks the return value. */
01178 void mbrk(address)
01179 char *address;
01180 {
01181   if (brk(address) == -1) error(TRUE, "Cannot reset memory", NIL_PTR);
01182 }
01183 
01184 void catch(dummy)
01185 int dummy;                      /* to satisfy the prototype */
01186 {
01187   register int i;
01188 
01189   signal(SIGINT, SIG_IGN);
01190   only_merge = FALSE;
01191   for (i = 0; i < 26; i++) (void) unlink(file_name(i));
01192   exit(2);
01193 }

Generated on Fri Apr 14 22:57:11 2006 for minix by  doxygen 1.4.6