From 2180ecc81b5f7635adbe5412010642e15fa212d3 Mon Sep 17 00:00:00 2001 From: Karel Zak Date: Thu, 4 Feb 2021 10:42:53 +0100 Subject: [PATCH] hardlink: replace with code from Debian The current version used in util-linux is based on original code from Jakub Jelinek. The new version is based on the Debian implementation from https://salsa.debian.org/jak/hardlink. This new version uses nftw() to walk the directory tree and organizes its internal data in a binary tree (tsearch() and twalk()). This new version provides more features like --ignore-{mode,owner,time}, --respect-xattrs, --respect-name, --include, --keep-oldest, --minimize, --maximize, etc. Note that the new version uses -f for --respect-name, whereas the old version used -f to force hardlinking across filesystems (very probably a rarely used feature). Addresses: https://github.com/karelzak/util-linux/issues/808 Signed-off-by: Karel Zak --- misc-utils/hardlink.1 | 131 ++-- misc-utils/hardlink.c | 1714 ++++++++++++++++++++++++++++------------- 2 files changed, 1273 insertions(+), 572 deletions(-) diff --git a/misc-utils/hardlink.1 b/misc-utils/hardlink.1 index 21cc7e8c4..dbb0b6372 100644 --- a/misc-utils/hardlink.1 +++ b/misc-utils/hardlink.1 @@ -1,69 +1,88 @@ -.TH "hardlink" "1" +.\" Copyright (C) 2008 - 2012 Julian Andres Klode. See hardlink.c for license. +.\" SPDX-License-Identifier: MIT +.TH hardlink 1 "2012-09-17" "0.3" .SH NAME -hardlink \- Consolidate duplicate files via hardlinks +hardlink \- Link multiple copies of a file .SH SYNOPSIS .B hardlink -[options] -.RI [ directory ...] +.RI [ option ]... +.RI [ directory | file ]... .SH DESCRIPTION -This manual page documents \fBhardlink\fR, a -program which consolidates duplicate files in one or more directories -using hardlinks. -.PP -\fBhardlink\fR traverses one -or more directories searching for duplicate files. When it finds duplicate -files, it uses one of them as the master. It then removes all other -duplicates and places a hardlink for each one pointing to the master file. 
-This allows for conservation of disk space where multiple directories -on a single filesystem contain many duplicate files. -.PP -Since hard links can only span a single filesystem, \fBhardlink\fR -is only useful when all directories specified are on the same filesystem. +.B hardlink +is a tool which replaces copies of a file with hardlinks, therefore saving +space. .SH OPTIONS .TP -.BR \-c , " \-\-content" -Compare only the contents of the files being considered for consolidation. -Disregards permission, ownership and other differences. +.B \-h or \-\-help +print quick usage details to the screen. .TP -.BR \-f , " \-\-force" -Force hardlinking across file systems. +.B \-v or \-\-verbose +More verbose output. If specified once, every hardlinked file is displayed, +if specified twice, it also shows every comparison. .TP -.BR \-n , " \-\-dry\-run" -Do not perform the consolidation; only print what would be changed. +.B \-n or \-\-dry\-run +Do not act, just print what would happen .TP -.BR \-v , " \-\-verbose" -Print summary after hardlinking. The option may be specified more than once. In -this case (e.g., \fB\-vv\fR) it prints every hardlinked file and bytes saved. +.B \-f or \-\-respect\-name +Only try to link files with the same (basename). .TP -.BR \-x , " \-\-exclude " \fIregex\fR -Exclude files and directories matching pattern from hardlinking. -.sp -The optional pattern for excluding files and directories must be a PCRE2 -compatible regular expression. Only the basename of the file or directory -is checked, not its path. Excluded directories' contents will not be examined. +.B \-p or \-\-ignore\-mode +Link/compare files even if their mode is different. This may be a bit unpredictable. .TP -.BR \-h , " \-\-help" -Display help text and exit. +.B \-o or \-\-ignore\-owner +Link/compare files even if their owner (user and group) is different. It is not +predictable .TP -.BR \-V , " \-\-version" -Display version information and exit. 
+.B \-t or \-\-ignore\-time +Link/compare files even if their time of modification is different. You almost +always want this. +.TP +.B \-X or \-\-respect\-xattrs +Only try to link files with the same extended attributes. +.TP +.B \-m or \-\-maximize +Among equal files, keep the file with the highest link count. +.TP +.B \-M or \-\-minimize +Among equal files, keep the file with the lowest link count. +.TP +.B \-O or \-\-keep\-oldest +Among equal files, keep the oldest file (least recent modification time). By +default, the newest file is kept. If \-\-maximize or \-\-minimize is specified, +the link count has a higher precedence than the time of modification. +.TP +.B \-x or \-\-exclude +A regular expression which excludes files from being compared and linked. +.TP +.B \-i or \-\-include +A regular expression to include files. If the option \-\-exclude has been given, +this option re-includes files which would otherwise be excluded. If the option +is used without \-\-exclude, only files matched by the pattern are included. +.TP +.B \-s or \-\-minimum\-size +The minimum size to consider. By default this is 1, so empty files will not +be linked. An optional suffix of K,M,G,T may be provided, indicating that the +file size is KiB,MiB,GiB,TiB. + +.SH ARGUMENTS +.B hardlink +takes one or more directories which will be searched for files to be linked. + .SH BUGS -\fBhardlink\fR assumes that its target directory trees do not change from under -it. If a directory tree does change, this may result in \fBhardlink\fR -accessing files and/or directories outside of the intended directory tree. -Thus, you must avoid running \fBhardlink\fR on potentially changing directory -trees, and especially on directory trees under control of another user. -.PP -Historically \fBhardlink\fR silently excluded any names beginning with -".in.", as well as any names beginning with "." followed by exactly 6 -other characters. 
That prior behavior can be achieved by specifying -.br -\-x '\(ha(\\.in\\.|\\.[\(ha.]{6}$)' -.SH AUTHORS -\fBhardlink\fR was written by Jakub Jelinek and later modified by -Ruediger Meier and Karel Zak for util-linux. -.PP -Man page written by Brian Long and later updated by Jindrich Novy -.SH AVAILABILITY -The hardlink command is part of the util-linux package and is available from -https://www.kernel.org/pub/linux/utils/util-linux/. +.B hardlink +assumes that the trees it operates on do not change during +operation. If a tree does change, the result is undefined and potentially +dangerous. For example, if a regular file is replaced by a device, hardlink +may start reading from the device. If a component of a path is replaced by +a symbolic link or file permissions change, security may be compromised. Do +not run hardlink on a changing tree or on a tree controlled by another user. + +.B hardlink +, as of version 0.3 RC1, improperly calculates the amount of space saved if the +option \-\-respect\-name is specified. In previous versions, the amount was +wrong in almost all other cases as well. + +.SH AUTHOR +The program hardlink and this manpage have been written by Julian Andres Klode, +and are licensed under the MIT license. See the code of hardlink for further +information. diff --git a/misc-utils/hardlink.c b/misc-utils/hardlink.c index 2711b2a7a..11bf55051 100644 --- a/misc-utils/hardlink.c +++ b/misc-utils/hardlink.c @@ -1,531 +1,1213 @@ -/* - * hardlink - consolidate duplicate files via hardlinks +/* hardlink.c - Link multiple identical files together * - * Copyright (C) 2018 Red Hat, Inc. All rights reserved. 
- * Written by Jakub Jelinek + * Copyright (C) 2008 - 2014 Julian Andres Klode * - * Copyright (C) 2019 Karel Zak + * SPDX-License-Identifier: MIT * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. * - * You should have received a copy of the GNU General Public License along - * with this program; if not, write to the Free Software Foundation, Inc., - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
*/ -#include -#include + +#define _GNU_SOURCE /* GNU extensions (optional) */ +#define _POSIX_C_SOURCE 200112L /* POSIX functions */ +#define _XOPEN_SOURCE 600 /* nftw() */ + +#define _FILE_OFFSET_BITS 64 /* Large file support */ +#define _LARGEFILE_SOURCE /* Large file support */ +#define _LARGE_FILES /* AIX apparently */ + +#include /* stat */ +#include /* stat */ +#include /* getrlimit, getrusage */ +#include /* getrlimit, getrusage */ +#include /* stat */ +#include /* posix_fadvise */ +#include /* ftw */ +#include /* tsearch() and friends */ + +#include /* strerror, errno */ +#include /* setlocale */ +#include /* SIG*, sigaction */ +#include /* stderr, fprint */ +#include /* va_arg */ +#include /* free(), realloc() */ +#include /* strcmp() and friends */ +#include /* assert() */ +#include /* tolower() */ + +/* Some boolean names for clarity */ +typedef enum hl_bool { + FALSE, + TRUE +} hl_bool; + +/* The makefile sets this for us and creates config.h */ +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* We don't have getopt_long(). Define no-op alternatives */ +#ifdef HAVE_GETOPT_LONG #include -#include -#include -#include -#include -#include -#include -#include -#include -#ifdef HAVE_PCRE -# define PCRE2_CODE_UNIT_WIDTH 8 -# include -#endif - -#include "c.h" -#include "xalloc.h" -#include "nls.h" -#include "closestream.h" - -#define NHASH (1<<17) /* Must be a power of 2! 
*/ -#define NBUF 64 - -struct hardlink_file; - -struct hardlink_hash { - struct hardlink_hash *next; - struct hardlink_file *chain; - off_t size; - time_t mtime; -}; - -struct hardlink_dir { - struct hardlink_dir *next; - char name[]; -}; - -struct hardlink_file { - struct hardlink_file *next; - ino_t ino; - dev_t dev; - unsigned int cksum; - char name[]; -}; - -struct hardlink_dynstr { - char *buf; - size_t alloc; -}; - -struct hardlink_ctl { - struct hardlink_dir *dirs; - struct hardlink_hash *hps[NHASH]; - char iobuf1[BUFSIZ]; - char iobuf2[BUFSIZ]; - /* summary counters */ - unsigned long long ndirs; - unsigned long long nobjects; - unsigned long long nregfiles; - unsigned long long ncomp; - unsigned long long nlinks; - unsigned long long nsaved; - /* current device */ - dev_t dev; - /* flags */ - unsigned int verbose; - unsigned int - no_link:1, - content_only:1, - force:1; -}; -/* ctl is in global scope due use in atexit() */ -struct hardlink_ctl global_ctl; - -__attribute__ ((always_inline)) -static inline unsigned int hash(off_t size, time_t mtime) -{ - return (size ^ mtime) & (NHASH - 1); -} - -__attribute__ ((always_inline)) -static inline int stcmp(struct stat *st1, struct stat *st2, int content_scope) -{ - if (content_scope) - return st1->st_size != st2->st_size; - - return st1->st_mode != st2->st_mode - || st1->st_uid != st2->st_uid - || st1->st_gid != st2->st_gid - || st1->st_size != st2->st_size - || st1->st_mtime != st2->st_mtime; -} - -static void print_summary(void) -{ - struct hardlink_ctl const *const ctl = &global_ctl; - - if (!ctl->verbose) - return; - - if (ctl->verbose > 1 && ctl->nlinks) - fputc('\n', stdout); - - printf(_("Directories: %9lld\n"), ctl->ndirs); - printf(_("Objects: %9lld\n"), ctl->nobjects); - printf(_("Regular files: %9lld\n"), ctl->nregfiles); - printf(_("Comparisons: %9lld\n"), ctl->ncomp); - printf( "%s%9lld\n", (ctl->no_link ? - _("Would link: ") : - _("Linked: ")), ctl->nlinks); - printf( "%s %9lld\n", (ctl->no_link ? 
- _("Would save: ") : - _("Saved: ")), ctl->nsaved); -} - -static void __attribute__((__noreturn__)) usage(void) -{ - fputs(USAGE_HEADER, stdout); - printf(_(" %s [options] directory...\n"), program_invocation_short_name); - - fputs(USAGE_SEPARATOR, stdout); - puts(_("Consolidate duplicate files using hardlinks.")); - - fputs(USAGE_OPTIONS, stdout); - puts(_(" -c, --content compare only contents, ignore permission, etc.")); - puts(_(" -n, --dry-run don't actually link anything")); - puts(_(" -v, --verbose print summary after hardlinking")); - puts(_(" -vv print every hardlinked file and summary")); - puts(_(" -f, --force force hardlinking across filesystems")); - puts(_(" -x, --exclude exclude files matching pattern")); - - fputs(USAGE_SEPARATOR, stdout); - printf(USAGE_HELP_OPTIONS(16)); /* char offset to align option descriptions */ - printf(USAGE_MAN_TAIL("hardlink(1)")); - exit(EXIT_SUCCESS); -} - -__attribute__ ((always_inline)) -static inline size_t add2(size_t a, size_t b) -{ - size_t sum = a + b; - - if (sum < a) - errx(EXIT_FAILURE, _("integer overflow")); - return sum; -} - -__attribute__ ((always_inline)) -static inline size_t add3(size_t a, size_t b, size_t c) -{ - return add2(add2(a, b), c); -} - -static void growstr(struct hardlink_dynstr *str, size_t newlen) -{ - if (newlen < str->alloc) - return; - str->buf = xrealloc(str->buf, str->alloc = add2(newlen, 1)); -} - -static void process_path(struct hardlink_ctl *ctl, const char *name) -{ - struct stat st, st2, st3; - const size_t namelen = strlen(name); - - ctl->nobjects++; - if (lstat(name, &st)) - return; - - if (st.st_dev != ctl->dev && !ctl->force) { - if (ctl->dev) - errx(EXIT_FAILURE, - _("%s is on different filesystem than the rest " - "(use -f option to override)."), name); - ctl->dev = st.st_dev; - } - if (S_ISDIR(st.st_mode)) { - struct hardlink_dir *dp = xmalloc(add3(sizeof(*dp), namelen, 1)); - memcpy(dp->name, name, namelen + 1); - dp->next = ctl->dirs; - ctl->dirs = dp; - - } else if 
(S_ISREG(st.st_mode)) { - int fd, i; - struct hardlink_file *fp, *fp2; - struct hardlink_hash *hp; - const char *n1, *n2; - unsigned int buf[NBUF]; - int cksumsize = sizeof(buf); - unsigned int cksum; - time_t mtime = ctl->content_only ? 0 : st.st_mtime; - unsigned int hsh = hash(st.st_size, mtime); - off_t fsize; - - ctl->nregfiles++; - if (ctl->verbose > 1) - printf("%s\n", name); - - fd = open(name, O_RDONLY); - if (fd < 0) - return; - - if ((size_t)st.st_size < sizeof(buf)) { - cksumsize = st.st_size; - memset(((char *)buf) + cksumsize, 0, - (sizeof(buf) - cksumsize) % sizeof(buf[0])); - } - if (read(fd, buf, cksumsize) != cksumsize) { - close(fd); - return; - } - cksumsize = (cksumsize + sizeof(buf[0]) - 1) / sizeof(buf[0]); - for (i = 0, cksum = 0; i < cksumsize; i++) { - if (cksum + buf[i] < cksum) - cksum += buf[i] + 1; - else - cksum += buf[i]; - } - for (hp = ctl->hps[hsh]; hp; hp = hp->next) { - if (hp->size == st.st_size && hp->mtime == mtime) - break; - } - if (!hp) { - hp = xmalloc(sizeof(*hp)); - hp->size = st.st_size; - hp->mtime = mtime; - hp->chain = NULL; - hp->next = ctl->hps[hsh]; - ctl->hps[hsh] = hp; - } - for (fp = hp->chain; fp; fp = fp->next) { - if (fp->cksum == cksum) - break; - } - for (fp2 = fp; fp2 && fp2->cksum == cksum; fp2 = fp2->next) { - if (fp2->ino == st.st_ino && fp2->dev == st.st_dev) { - close(fd); - return; - } - } - for (fp2 = fp; fp2 && fp2->cksum == cksum; fp2 = fp2->next) { - - if (!lstat(fp2->name, &st2) && S_ISREG(st2.st_mode) && - !stcmp(&st, &st2, ctl->content_only) && - st2.st_ino != st.st_ino && - st2.st_dev == st.st_dev) { - - int fd2 = open(fp2->name, O_RDONLY); - if (fd2 < 0) - continue; - - if (fstat(fd2, &st2) || !S_ISREG(st2.st_mode) - || st2.st_size == 0) { - close(fd2); - continue; - } - ctl->ncomp++; - lseek(fd, 0, SEEK_SET); - - for (fsize = st.st_size; fsize > 0; - fsize -= (off_t)sizeof(ctl->iobuf1)) { - ssize_t xsz; - ssize_t rsize = fsize > (ssize_t) sizeof(ctl->iobuf1) ? 
- (ssize_t) sizeof(ctl->iobuf1) : fsize; - - if ((xsz = read(fd, ctl->iobuf1, rsize)) != rsize) - warn(_("cannot read %s"), name); - else if ((xsz = read(fd2, ctl->iobuf2, rsize)) != rsize) - warn(_("cannot read %s"), fp2->name); - - if (xsz != rsize) { - close(fd); - close(fd2); - return; - } - if (memcmp(ctl->iobuf1, ctl->iobuf2, rsize) != 0) - break; - } - close(fd2); - if (fsize > 0) - continue; - if (lstat(name, &st3)) { - warn(_("cannot stat %s"), name); - close(fd); - return; - } - st3.st_atime = st.st_atime; - if (stcmp(&st, &st3, 0)) { - warnx(_("file %s changed underneath us"), name); - close(fd); - return; - } - n1 = fp2->name; - n2 = name; - - if (!ctl->no_link) { - const char *suffix = - ".$$$___cleanit___$$$"; - const size_t suffixlen = strlen(suffix); - size_t n2len = strlen(n2); - struct hardlink_dynstr nam2 = { NULL, 0 }; - - growstr(&nam2, add2(n2len, suffixlen)); - memcpy(nam2.buf, n2, n2len); - memcpy(&nam2.buf[n2len], suffix, - suffixlen + 1); - /* First create a temporary link to n1 under a new name */ - if (link(n1, nam2.buf)) { - warn(_("failed to hardlink %s to %s (create temporary link as %s failed)"), - n1, n2, nam2.buf); - free(nam2.buf); - continue; - } - /* Then rename into place over the existing n2 */ - if (rename(nam2.buf, n2)) { - warn(_("failed to hardlink %s to %s (rename temporary link to %s failed)"), - n1, n2, n2); - /* Something went wrong, try to remove the now redundant temporary link */ - if (unlink(nam2.buf)) - warn(_("failed to remove temporary link %s"), nam2.buf); - free(nam2.buf); - continue; - } - free(nam2.buf); - } - ctl->nlinks++; - if (st3.st_nlink > 1) { - /* We actually did not save anything this time, since the link second argument - had some other links as well. */ - if (ctl->verbose > 1) - printf(_(" %s %s to %s\n"), - (ctl->no_link ? 
_("Would link") : _("Linked")), - n1, n2); - } else { - ctl->nsaved += ((st.st_size + 4095) / 4096) * 4096; - if (ctl->verbose > 1) - printf(_(" %s %s to %s, %s %jd\n"), - (ctl->no_link ? _("Would link") : _("Linked")), - n1, n2, - (ctl->no_link ? _("would save") : _("saved")), - (intmax_t)st.st_size); - } - close(fd); - return; - } - } - fp2 = xmalloc(add3(sizeof(*fp2), namelen, 1)); - close(fd); - fp2->ino = st.st_ino; - fp2->dev = st.st_dev; - fp2->cksum = cksum; - memcpy(fp2->name, name, namelen + 1); - - if (fp) { - fp2->next = fp->next; - fp->next = fp2; - } else { - fp2->next = hp->chain; - hp->chain = fp2; - } - return; - } -} - -int main(int argc, char **argv) -{ - int ch; - int i; -#ifdef HAVE_PCRE - int errornumber; - PCRE2_SIZE erroroffset; - pcre2_code *re = NULL; - PCRE2_SPTR exclude_pattern = NULL; - pcre2_match_data *match_data = NULL; -#endif - struct hardlink_dynstr nam1 = { NULL, 0 }; - struct hardlink_ctl *ctl = &global_ctl; - - static const struct option longopts[] = { - { "content", no_argument, NULL, 'c' }, - { "dry-run", no_argument, NULL, 'n' }, - { "exclude", required_argument, NULL, 'x' }, - { "force", no_argument, NULL, 'f' }, - { "help", no_argument, NULL, 'h' }, - { "verbose", no_argument, NULL, 'v' }, - { "version", no_argument, NULL, 'V' }, - { NULL, 0, NULL, 0 }, - }; - - setlocale(LC_ALL, ""); - bindtextdomain(PACKAGE, LOCALEDIR); - textdomain(PACKAGE); - close_stdout_atexit(); - - while ((ch = getopt_long(argc, argv, "cnvfx:Vh", longopts, NULL)) != -1) { - switch (ch) { - case 'n': - ctl->no_link = 1; - break; - case 'v': - ctl->verbose++; - break; - case 'c': - ctl->content_only = 1; - break; - case 'f': - ctl->force = 1; - break; - case 'x': -#ifdef HAVE_PCRE - exclude_pattern = (PCRE2_SPTR) optarg; #else - errx(EXIT_FAILURE, - _("option --exclude not supported (built without pcre2)")); +#define getopt_long(argc, argv, shrt, lng, index) getopt((argc), (argv), (shrt)) #endif - break; - case 'V': - print_version(EXIT_SUCCESS); - 
case 'h': - usage(); - default: - errtryhelp(EXIT_FAILURE); - } - } - if (optind == argc) { - warnx(_("no directory specified")); - errtryhelp(EXIT_FAILURE); - } - -#ifdef HAVE_PCRE - if (exclude_pattern) { - re = pcre2_compile(exclude_pattern, /* the pattern */ - PCRE2_ZERO_TERMINATED, /* indicates pattern is zero-terminate */ - 0, /* default options */ - &errornumber, &erroroffset, NULL); /* use default compile context */ - if (!re) { - PCRE2_UCHAR buffer[256]; - pcre2_get_error_message(errornumber, buffer, - sizeof(buffer)); - errx(EXIT_FAILURE, _("pattern error at offset %d: %s"), - (int)erroroffset, buffer); - } - match_data = pcre2_match_data_create_from_pattern(re, NULL); - } +/* For systems without posix_fadvise */ +#ifndef HAVE_POSIX_FADVISE +#define posix_fadvise(fd, offset, len, advise) (void) 0 #endif - atexit(print_summary); - for (i = optind; i < argc; i++) - process_path(ctl, argv[i]); - - while (ctl->dirs) { - DIR *dh; - struct dirent *di; - struct hardlink_dir *dp = ctl->dirs; - size_t nam1baselen = strlen(dp->name); - - ctl->dirs = dp->next; - growstr(&nam1, add2(nam1baselen, 1)); - memcpy(nam1.buf, dp->name, nam1baselen); - free(dp); - nam1.buf[nam1baselen++] = '/'; - nam1.buf[nam1baselen] = 0; - dh = opendir(nam1.buf); - - if (dh == NULL) - continue; - ctl->ndirs++; - - while ((di = readdir(dh)) != NULL) { - if (!di->d_name[0]) - continue; - if (di->d_name[0] == '.') { - if (!di->d_name[1] || !strcmp(di->d_name, "..")) - continue; - } -#ifdef HAVE_PCRE - if (re && pcre2_match(re, /* compiled regex */ - (PCRE2_SPTR) di->d_name, strlen(di->d_name), 0, /* start at offset 0 */ - 0, /* default options */ - match_data, /* block for storing the result */ - NULL) /* use default match context */ - >=0) { - if (ctl->verbose) { - nam1.buf[nam1baselen] = 0; - printf(_("Skipping %s%s\n"), nam1.buf, di->d_name); - } - continue; - } +/* __attribute__ is fairly GNU-specific, define a no-op alternative elsewhere */ +#ifndef __GNUC__ +#define 
__attribute__(attributes) #endif - { - size_t subdirlen; - growstr(&nam1, - add2(nam1baselen, subdirlen = - strlen(di->d_name))); - memcpy(&nam1.buf[nam1baselen], di->d_name, - add2(subdirlen, 1)); - } - process_path(ctl, nam1.buf); - } - closedir(dh); - } -#ifdef HAVE_PCRE - pcre2_code_free(re); - pcre2_match_data_free(match_data); + +/* Use libpcreposix if it's available, it's cooler */ +#if defined(HAVE_libpcre2_posix) +#warning hie +#include +#undef REG_NOSUB +#define REG_NOSUB 0 /* we do want backreferences in PCRE mode */ +#elif defined(HAVE_libpcreposix) +#include +#undef REG_NOSUB +#define REG_NOSUB 0 /* we do want backreferences in PCRE mode */ +#else +#include /* regcomp(), regexec() */ #endif - return 0; + +#ifdef HAVE_XATTR +#include /* listxattr, getxattr */ +#endif + +/** + * struct file - Information about a file + * @st: The stat buffer associated with the file + * @next: Next file with the same size + * @basename: The offset of the basename in the filename + * @path: The path of the file + * + * This contains all information we need about a file. 
+ */ +struct file { + struct stat st; + struct file *next; + struct link { + struct link *next; + int basename; +#if __STDC_VERSION__ >= 199901L + char path[]; +#elif __GNUC__ + char path[0]; +#else + char path[1]; +#endif + } *links; +}; + +/** + * enum log_level - Logging levels + * @JLOG_SYSFAT: Fatal error message with errno, will be printed to stderr + * @JLOG_FATAL: Fatal error message, will be printed to stderr + * @JLOG_SYSERR: Error message with errno, will be printed to stderr + * @JLOG_ERROR: Error message, will be printed to stderr + * @JLOG_SUMMARY: Default log level + * @JLOG_INFO: Verbose logging (verbose == 1) + * @JLOG_DEBUG1: Verbosity 2 + * @JLOG_DEBUG2: Verbosity 3 + */ +enum log_level { + JLOG_SYSFAT = -4, + JLOG_FATAL = -3, + JLOG_SYSERR = -2, + JLOG_ERROR = -1, + JLOG_SUMMARY, + JLOG_INFO, + JLOG_DEBUG1, + JLOG_DEBUG2 +}; + +/** + * struct statistics - Statistics about the current run + * @started: Whether we are post command-line processing + * @files: The number of files worked on + * @linked: The number of files replaced by a hardlink to a master + * @xattr_comparisons: The number of extended attribute comparisons + * @comparisons: The number of comparisons + * @saved: The (exaggerated) amount of space saved + * @start_time: The time we started at, in seconds since some unspecified point + */ +static struct statistics { + hl_bool started; + size_t files; + size_t linked; + size_t xattr_comparisons; + size_t comparisons; + double saved; + double start_time; +} stats; + +/** + * struct options - Processed command-line options + * @include: A linked list of regular expressions for the --include option + * @exclude: A linked list of regular expressions for the --exclude option + * @verbosity: The verbosity.
Should be one of #enum log_level + * @respect_mode: Whether to respect file modes (default = TRUE) + * @respect_owner: Whether to respect file owners (uid, gid; default = TRUE) + * @respect_name: Whether to respect file names (default = FALSE) + * @respect_time: Whether to respect file modification times (default = TRUE) + * @respect_xattrs: Whether to respect extended attributes (default = FALSE) + * @maximise: Chose the file with the highest link count as master + * @minimise: Chose the file with the lowest link count as master + * @keep_oldest: Choose the file with oldest timestamp as master (default = FALSE) + * @dry_run: Specifies whether hardlink should not link files (default = FALSE) + * @min_size: Minimum size of files to consider. (default = 1 byte) + */ +static struct options { + struct regex_link { + regex_t preg; + struct regex_link *next; + } *include, *exclude; + signed int verbosity; + unsigned int respect_mode:1; + unsigned int respect_owner:1; + unsigned int respect_name:1; + unsigned int respect_time:1; + unsigned int respect_xattrs:1; + unsigned int maximise:1; + unsigned int minimise:1; + unsigned int keep_oldest:1; + unsigned int dry_run:1; + unsigned long long min_size; +} opts; + +/* + * files + * + * A binary tree of files, managed using tsearch(). To see which nodes + * are considered equal, see compare_nodes() + */ +static void *files; +static void *files_by_ino; + +/* + * last_signal + * + * The last signal we received. We store the signal here in order to be able + * to break out of loops gracefully and to return from our nftw() handler. + */ +static int last_signal; + +__attribute__ ((format(printf, 2, 3))) +/** + * jlog - Logging for hardlink + * @level: The log level + * @format: A format string for printf() + */ +static void jlog(enum log_level level, const char *format, ...) +{ + FILE *stream = (level >= 0) ? 
stdout : stderr; + int errno_ = errno; + va_list args; + + if (level <= opts.verbosity) { + if (level <= JLOG_FATAL) + fprintf(stream, "ERROR: "); + else if (level < 0) + fprintf(stream, "WARNING: "); + va_start(args, format); + vfprintf(stream, format, args); + va_end(args); + if (level == JLOG_SYSERR || level == JLOG_SYSFAT) + fprintf(stream, ": %s\n", strerror(errno_)); + else + fputc('\n', stream); + } +} + +/** + * CMP - Compare two numerical values, return 1, 0, or -1 + * @a: First value + * @b: Second value + * + * Used to compare two integers of any size while avoiding overflow. + */ +#define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0)) + +/** + * format - Print a human-readable name for the given size + * @bytes: A number specifying an amount of bytes + * + * Uses a double. The result with infinity and NaN is most likely + * not pleasant. + */ +static const char *format(double bytes) +{ + static char buf[256]; + + if (bytes >= 1024 * 1024 * 1024) + snprintf(buf, sizeof(buf), "%.2f GiB", (bytes / 1024 / 1024 / 1024)); + else if (bytes >= 1024 * 1024) + snprintf(buf, sizeof(buf), "%.2f MiB", (bytes / 1024 / 1024)); + else if (bytes >= 1024) + snprintf(buf, sizeof(buf), "%.2f KiB", (bytes / 1024)); + else + snprintf(buf, sizeof(buf), "%.0f bytes", bytes); + + return buf; +} + +/** + * gettime() - Get the current time from the system + */ +static double gettime(void) +{ + struct timeval tv = { 0, 0 }; + + if (gettimeofday(&tv, NULL) != 0) + jlog(JLOG_SYSERR, "Cannot read current time"); + + return (double) tv.tv_sec + (double) tv.tv_usec / 1000000; +} + +/** + * regexec_any - Match against multiple regular expressions + * @pregs: A linked list of regular expressions + * @what: The string to match against + * + * Checks whether any of the regular expressions in the list matches the + * string. 
+ */ +static hl_bool regexec_any(struct regex_link *pregs, const char *what) +{ + for (; pregs != NULL; pregs = pregs->next) + if (regexec(&pregs->preg, what, 0, NULL, 0) == 0) + return TRUE; + return FALSE; +} + +/** + * compare_nodes - Node comparison function + * @_a: The first node (a #struct file) + * @_b: The second node (a #struct file) + * + * Compare the two nodes for the binary tree. + */ +static int compare_nodes(const void *_a, const void *_b) +{ + const struct file *a = _a; + const struct file *b = _b; + int diff = 0; + + if (diff == 0) + diff = CMP(a->st.st_dev, b->st.st_dev); + if (diff == 0) + diff = CMP(a->st.st_size, b->st.st_size); + + return diff; +} + +/** + * compare_nodes_ino - Node comparison function + * @_a: The first node (a #struct file) + * @_b: The second node (a #struct file) + * + * Compare the two nodes for the binary tree. + */ +static int compare_nodes_ino(const void *_a, const void *_b) +{ + const struct file *a = _a; + const struct file *b = _b; + int diff = 0; + + if (diff == 0) + diff = CMP(a->st.st_dev, b->st.st_dev); + if (diff == 0) + diff = CMP(a->st.st_ino, b->st.st_ino); + + /* If opts.respect_name is used, we will restrict a struct file to + * contain only links with the same basename to keep the rest simple. + */ + if (diff == 0 && opts.respect_name) + diff = strcmp(a->links->path + a->links->basename, + b->links->path + b->links->basename); + + return diff; +} + +/** + * print_stats - Print statistics to stdout + */ +static void print_stats(void) +{ + jlog(JLOG_SUMMARY, "Mode: %s", opts.dry_run ? 
"dry-run" : "real"); + jlog(JLOG_SUMMARY, "Files: %zu", stats.files); + jlog(JLOG_SUMMARY, "Linked: %zu files", stats.linked); +#ifdef HAVE_XATTR + jlog(JLOG_SUMMARY, "Compared: %zu xattrs", stats.xattr_comparisons); +#endif + jlog(JLOG_SUMMARY, "Compared: %zu files", stats.comparisons); + jlog(JLOG_SUMMARY, "Saved: %s", format(stats.saved)); + jlog(JLOG_SUMMARY, "Duration: %.2f seconds", gettime() - stats.start_time); +} + +/** + * handle_interrupt - Handle a signal + * + * Returns: %TRUE on SIGINT, SIGTERM; %FALSE on all other signals. + */ +static hl_bool handle_interrupt(void) +{ + switch (last_signal) { + case SIGINT: + case SIGTERM: + return TRUE; + case SIGUSR1: + print_stats(); + putchar('\n'); + break; + } + + last_signal = 0; + return FALSE; +} + +#ifdef HAVE_XATTR + +/** + * malloc_or_die -- Wrapper for malloc() + * + * This does the same thing as malloc() except that it aborts if memory + * can't be allocated. + */ +static void *malloc_or_die(size_t size) +{ + void *mem = malloc(size); + + if (!mem) { + jlog(JLOG_SYSFAT, "Cannot allocate memory"); + exit(1); + } + return mem; +} + +/** + * llistxattr_or_die - Wrapper for llistxattr() + * + * This does the same thing as llistxattr() except that it aborts if any error + * other than "not supported" is detected. + */ +static ssize_t llistxattr_or_die(const char *path, char *list, size_t size) +{ + ssize_t len = llistxattr(path, list, size); + + if (len < 0 && errno != ENOTSUP) { + jlog(JLOG_SYSFAT, "Cannot get xattr names for %s", path); + exit(1); + } + return len; +} + +/** + * lgetxattr_or_die - Wrapper for lgetxattr() + * + * This does the same thing as lgetxattr() except that it aborts upon error. 
+ */ +static ssize_t lgetxattr_or_die(const char *path, const char *name, void *value, + size_t size) +{ + ssize_t len = lgetxattr(path, name, value, size); + + if (len < 0) { + jlog(JLOG_SYSFAT, "Cannot get xattr value of %s for %s", name, path); + exit(1); + } + return len; +} + +/** + * get_xattr_name_count - Count the number of xattr names + * @names: a non-empty table of concatenated, null-terminated xattr names + * @len: the total length of the table + * + * @Returns the number of xattr names + */ +static int get_xattr_name_count(const char *const names, ssize_t len) +{ + int count = 0; + const char *name; + + for (name = names; name < (names + len); name += strlen(name) + 1) + count++; + + return count; +} + +/** + * cmp_xattr_name_ptrs - Compare two pointers to xattr names by comparing + * the names they point to. + */ +static int cmp_xattr_name_ptrs(const void *ptr1, const void *ptr2) +{ + return strcmp(*(char *const *) ptr1, *(char *const *) ptr2); +} + +/** + * get_sorted_xattr_name_table - Create a sorted table of xattr names. 
+ * @names - table of concatenated, null-terminated xattr names + * @n - the number of names + * + * @Returns allocated table of pointers to the names, sorted alphabetically + */ +static const char **get_sorted_xattr_name_table(const char *names, int n) +{ + const char **table = malloc_or_die(n * sizeof(char *)); + int i; + + for (i = 0; i < n; i++) { + table[i] = names; + names += strlen(names) + 1; + } + + qsort(table, n, sizeof(char *), cmp_xattr_name_ptrs); + + return table; +} + +/** + * file_xattrs_equal - Compare the extended attributes of two files + * @a: The first file + * @b: The second file + * + * @Returns: %TRUE if and only if extended attributes are equal + */ +static hl_bool file_xattrs_equal(const struct file *a, const struct file *b) +{ + ssize_t len_a; + ssize_t len_b; + char *names_a = NULL; + char *names_b = NULL; + int n_a; + int n_b; + const char **name_ptrs_a = NULL; + const char **name_ptrs_b = NULL; + void *value_a = NULL; + void *value_b = NULL; + hl_bool ret = FALSE; + int i; + + assert(a->links != NULL); + assert(b->links != NULL); + + jlog(JLOG_DEBUG1, "Comparing xattrs of %s to %s", a->links->path, + b->links->path); + + stats.xattr_comparisons++; + + len_a = llistxattr_or_die(a->links->path, NULL, 0); + len_b = llistxattr_or_die(b->links->path, NULL, 0); + + if (len_a <= 0 && len_b <= 0) + return TRUE; // xattrs not supported or neither file has any + + if (len_a != len_b) + return FALSE; // total lengths of xattr names differ + + names_a = malloc_or_die(len_a); + names_b = malloc_or_die(len_b); + + len_a = llistxattr_or_die(a->links->path, names_a, len_a); + len_b = llistxattr_or_die(b->links->path, names_b, len_b); + assert((len_a > 0) && (len_a == len_b)); + + n_a = get_xattr_name_count(names_a, len_a); + n_b = get_xattr_name_count(names_b, len_b); + + if (n_a != n_b) + goto exit; // numbers of xattrs differ + + name_ptrs_a = get_sorted_xattr_name_table(names_a, n_a); + name_ptrs_b = get_sorted_xattr_name_table(names_b, n_b); + + 
// We now have two sorted tables of xattr names. + + for (i = 0; i < n_a; i++) { + if (handle_interrupt()) + goto exit; // user wants to quit + + if (strcmp(name_ptrs_a[i], name_ptrs_b[i]) != 0) + goto exit; // names at same slot differ + + len_a = lgetxattr_or_die(a->links->path, name_ptrs_a[i], NULL, 0); + len_b = lgetxattr_or_die(b->links->path, name_ptrs_b[i], NULL, 0); + + if (len_a != len_b) + goto exit; // xattrs with same name, different value lengths + + value_a = malloc_or_die(len_a); + value_b = malloc_or_die(len_b); + + len_a = lgetxattr_or_die(a->links->path, name_ptrs_a[i], + value_a, len_a); + len_b = lgetxattr_or_die(b->links->path, name_ptrs_b[i], + value_b, len_b); + assert((len_a >= 0) && (len_a == len_b)); + + if (memcmp(value_a, value_b, len_a) != 0) + goto exit; // xattrs with same name, different values + + free(value_a); + free(value_b); + value_a = NULL; + value_b = NULL; + } + + ret = TRUE; + + exit: + free(names_a); + free(names_b); + free(name_ptrs_a); + free(name_ptrs_b); + free(value_a); + free(value_b); + return ret; +} +#else +static hl_bool file_xattrs_equal(const struct file *a, const struct file *b) +{ + return TRUE; +} +#endif + +/** + * file_contents_equal - Compare contents of two files for equality + * @a: The first file + * @b: The second file + * + * Compare the contents of the files for equality + */ +static hl_bool file_contents_equal(const struct file *a, const struct file *b) +{ + FILE *fa = NULL; + FILE *fb = NULL; + char buf_a[8192]; + char buf_b[8192]; + int cmp = 0; /* zero => equal */ + off_t off = 0; /* current offset */ + + assert(a->links != NULL); + assert(b->links != NULL); + + jlog(JLOG_DEBUG1, "Comparing %s to %s", a->links->path, b->links->path); + + stats.comparisons++; + + if ((fa = fopen(a->links->path, "rb")) == NULL) + goto err; + if ((fb = fopen(b->links->path, "rb")) == NULL) + goto err; + + posix_fadvise(fileno(fa), 0, 0, POSIX_FADV_SEQUENTIAL); + posix_fadvise(fileno(fb), 0, 0, 
POSIX_FADV_SEQUENTIAL); + + while (!handle_interrupt() && cmp == 0) { + size_t ca; + size_t cb; + + ca = fread(buf_a, 1, sizeof(buf_a), fa); + if (ca < sizeof(buf_a) && ferror(fa)) + goto err; + + cb = fread(buf_b, 1, sizeof(buf_b), fb); + if (cb < sizeof(buf_b) && ferror(fb)) + goto err; + + off += ca; + + if ((ca != cb || ca == 0)) { + cmp = CMP(ca, cb); + break; + } + cmp = memcmp(buf_a, buf_b, ca); + } + out: + if (fa != NULL) + fclose(fa); + if (fb != NULL) + fclose(fb); + return !handle_interrupt() && cmp == 0; + err: + if (fa == NULL || fb == NULL) + jlog(JLOG_SYSERR, "Cannot open %s", + fa ? b->links->path : a->links->path); + else + jlog(JLOG_SYSERR, "Cannot read %s", + ferror(fa) ? a->links->path : b->links->path); + cmp = 1; + goto out; +} + +/** + * file_may_link_to - Check whether a file may replace another one + * @a: The first file + * @b: The second file + * + * Check whether the two fies are considered equal and can be linked + * together. If the two files are identical, the result will be FALSE, + * as replacing a link with an identical one is stupid. 
+ */ +static hl_bool file_may_link_to(const struct file *a, const struct file *b) +{ + return (a->st.st_size != 0 && + a->st.st_size == b->st.st_size && + a->links != NULL && b->links != NULL && + a->st.st_dev == b->st.st_dev && + a->st.st_ino != b->st.st_ino && + (!opts.respect_mode || a->st.st_mode == b->st.st_mode) && + (!opts.respect_owner || a->st.st_uid == b->st.st_uid) && + (!opts.respect_owner || a->st.st_gid == b->st.st_gid) && + (!opts.respect_time || a->st.st_mtime == b->st.st_mtime) && + (!opts.respect_name + || strcmp(a->links->path + a->links->basename, + b->links->path + b->links->basename) == 0) && + (!opts.respect_xattrs || file_xattrs_equal(a, b)) && + file_contents_equal(a, b)); +} + +/** + * file_compare - Compare two files to decide which should be master + * @a: The first file + * @b: The second file + * + * Check which of the files should be considered greater and thus serve + * as the master when linking (the master is the file that all equal files + * will be replaced with). + */ +static int file_compare(const struct file *a, const struct file *b) +{ + int res = 0; + if (a->st.st_dev == b->st.st_dev && a->st.st_ino == b->st.st_ino) + return 0; + + if (res == 0 && opts.maximise) + res = CMP(a->st.st_nlink, b->st.st_nlink); + if (res == 0 && opts.minimise) + res = CMP(b->st.st_nlink, a->st.st_nlink); + if (res == 0) + res = opts.keep_oldest ? CMP(b->st.st_mtime, a->st.st_mtime) + : CMP(a->st.st_mtime, b->st.st_mtime); + if (res == 0) + res = CMP(b->st.st_ino, a->st.st_ino); + + return res; +} + +/** + * file_link - Replace b with a link to a + * @a: The first file + * @b: The second file + * + * Link the file, replacing @b with the current one. The file is first + * linked to a temporary name, and then renamed to the name of @b, making + * the replace atomic (@b will always exist). 
+ */ +static hl_bool file_link(struct file *a, struct file *b) +{ + file_link: + assert(a->links != NULL); + assert(b->links != NULL); + + jlog(JLOG_INFO, "%sLinking %s to %s (-%s)", + opts.dry_run ? "[DryRun] " : "", a->links->path, b->links->path, + format(a->st.st_size)); + + if (!opts.dry_run) { + size_t len = strlen(b->links->path) + strlen(".hardlink-temporary") + 1; + char *new_path = malloc(len); + + if (new_path == NULL) { + jlog(JLOG_SYSFAT, "Cannot allocate memory"); + exit(1); + } + + snprintf(new_path, len, "%s.hardlink-temporary", b->links->path); + + if (link(a->links->path, new_path) != 0) { + jlog(JLOG_SYSERR, "Cannot link %s to %s", a->links->path, new_path); + free(new_path); + return FALSE; + } else if (rename(new_path, b->links->path) != 0) { + jlog(JLOG_SYSERR, "Cannot rename %s to %s", a->links->path, + new_path); + unlink(new_path); /* cleanup failed rename */ + free(new_path); + return FALSE; + } + free(new_path); + } + + /* Update statistics */ + stats.linked++; + + /* Increase the link count of this file, and set stat() of other file */ + a->st.st_nlink++; + b->st.st_nlink--; + + if (b->st.st_nlink == 0) + stats.saved += a->st.st_size; + + /* Move the link from file b to a */ + { + struct link *new_link = b->links; + + b->links = b->links->next; + new_link->next = a->links->next; + a->links->next = new_link; + } + + // Do it again + if (b->links) + goto file_link; + + return TRUE; +} + +/** + * inserter - Callback function for nftw() + * @fpath: The path of the file being visited + * @sb: The stat information of the file + * @typeflag: The type flag + * @ftwbuf: Contains current level of nesting and offset of basename + * + * Called by nftw() for the files. See the manual page for nftw() for + * further information. 
+ */ +static int inserter(const char *fpath, const struct stat *sb, int typeflag, + struct FTW *ftwbuf) +{ + struct file *fil; + struct file **node; + size_t pathlen; + hl_bool included; + hl_bool excluded; + + if (handle_interrupt()) + return 1; + if (typeflag == FTW_DNR || typeflag == FTW_NS) + jlog(JLOG_SYSERR, "Cannot read %s", fpath); + if (typeflag != FTW_F || !S_ISREG(sb->st_mode)) + return 0; + + included = regexec_any(opts.include, fpath); + excluded = regexec_any(opts.exclude, fpath); + + if ((opts.exclude && excluded && !included) || + (!opts.exclude && opts.include && !included)) + return 0; + + stats.files++; + + if (sb->st_size < opts.min_size) { + jlog(JLOG_DEBUG1, "Skipped %s (smaller than configured size)", fpath); + return 0; + } + + jlog(JLOG_DEBUG2, "Visiting %s (file %zu)", fpath, stats.files); + + pathlen = strlen(fpath) + 1; + + fil = calloc(1, sizeof(*fil)); + + if (fil == NULL) + return jlog(JLOG_SYSFAT, "Cannot continue"), 1; + + fil->links = calloc(1, sizeof(struct link) + pathlen); + + if (fil->links == NULL) + return jlog(JLOG_SYSFAT, "Cannot continue"), 1; + + fil->st = *sb; + fil->links->basename = ftwbuf->base; + fil->links->next = NULL; + + memcpy(fil->links->path, fpath, pathlen); + + node = tsearch(fil, &files_by_ino, compare_nodes_ino); + + if (node == NULL) + return jlog(JLOG_SYSFAT, "Cannot continue"), 1; + + if (*node != fil) { + /* Already known inode, add link to inode information */ + assert((*node)->st.st_dev == sb->st_dev); + assert((*node)->st.st_ino == sb->st_ino); + + fil->links->next = (*node)->links; + (*node)->links = fil->links; + + free(fil); + } else { + /* New inode, insert into by-size table */ + node = tsearch(fil, &files, compare_nodes); + + if (node == NULL) + return jlog(JLOG_SYSFAT, "Cannot continue"), 1; + + if (*node != fil) { + struct file *l; + + if (file_compare(fil, *node) >= 0) { + fil->next = *node; + *node = fil; + } else { + for (l = *node; l != NULL; l = l->next) { + if (l->next != NULL && 
file_compare(fil, l->next) < 0) + continue; + + fil->next = l->next; + l->next = fil; + + break; + } + } + } + } + + return 0; +} + +/** + * visitor - Callback for twalk() + * @nodep: Pointer to a pointer to a #struct file + * @which: At which point this visit is (preorder, postorder, endorder) + * @depth: The depth of the node in the tree + * + * Visit the nodes in the binary tree. For each node, call hardlinker() + * on each #struct file in the linked list of #struct file instances located + * at that node. + */ +static void visitor(const void *nodep, const VISIT which, const int depth) +{ + struct file *master = *(struct file **) nodep; + struct file *other; + + (void) depth; + + if (which != leaf && which != endorder) + return; + + for (; master != NULL; master = master->next) { + if (handle_interrupt()) + exit(1); + if (master->links == NULL) + continue; + + for (other = master->next; other != NULL; other = other->next) { + if (handle_interrupt()) + exit(1); + + assert(other != other->next); + assert(other->st.st_size == master->st.st_size); + + if (other->links == NULL || !file_may_link_to(master, other)) + continue; + + if (!file_link(master, other) && errno == EMLINK) + master = other; + } + } +} + +/** + * version - Print the program version and exit + */ +static int version(void) +{ + printf("hardlink 0.3 RC2\n"); + printf("Compiled %s at %s\n", __DATE__, __TIME__); + exit(0); +} + +/** + * help - Print the program help and exit + * @name: The name of the program executable (argv[0]) + */ +static int help(const char *name) +{ + printf("Usage: %s [options] directory|file ...\n", name); + puts("Options:"); + puts(" -V, --version show program's version number and exit"); + puts(" -h, --help show this help message and exit"); + puts(" -v, --verbose Increase verbosity (repeat for more verbosity)"); + puts(" -n, --dry-run Modify nothing, just print what would happen"); + puts(" -f, --respect-name Filenames have to be identical"); + puts(" -p, --ignore-mode 
Ignore changes of file mode"); + puts(" -o, --ignore-owner Ignore owner changes"); + puts(" -t, --ignore-time Ignore timestamps (when testing for equality)"); +#ifdef HAVE_XATTR + puts(" -X, --respect-xattrs Respect extended attributes"); +#endif + puts(" -m, --maximize Maximize the hardlink count, remove the file with"); + puts(" lowest hardlink cout"); + puts(" -M, --minimize Reverse the meaning of -m"); + puts(" -O, --keep-oldest Keep the oldest file of multiple equal files"); + puts(" (lower precedence than minimize/maximize)"); + puts(" -x REGEXP, --exclude=REGEXP"); + puts(" Regular expression to exclude files"); + puts(" -i REGEXP, --include=REGEXP"); + puts(" Regular expression to include files/dirs"); + puts(" -s [K,M,G], --minimum-size=[K,M,G]"); + puts(" Minimum size for files. Optional suffix"); + puts(" allows for using KiB, MiB, or GiB"); + puts(""); + puts("Compatibility options to Jakub Jelinek's hardlink:"); + puts(" -c Compare only file contents, same as -pot"); + +#ifndef HAVE_GETOPT_LONG + puts(""); + puts("Your system only supports the short option names given above."); +#endif + exit(0); +} + +/** + * register_regex - Compile and insert a regular expression into list + * @pregs: Pointer to a linked list of regular expressions + * @regex: String containing the regular expression to be compiled + */ +static int register_regex(struct regex_link **pregs, const char *regex) +{ + struct regex_link *link; + int err; + + link = malloc(sizeof(*link)); + + if (link == NULL) { + jlog(JLOG_SYSFAT, "Cannot allocate memory"); + exit(1); + } + + if ((err = regcomp(&link->preg, regex, REG_NOSUB | REG_EXTENDED)) != 0) { + size_t size = regerror(err, &link->preg, NULL, 0); + char *buf = malloc(size + 1); + + if (buf == NULL) { + jlog(JLOG_SYSFAT, "Cannot allocate memory"); + exit(1); + } + + regerror(err, &link->preg, buf, size); + + jlog(JLOG_FATAL, "Could not compile regular expression %s: %s", + regex, buf); + free(buf); + free(link); + return 1; + } + + 
link->next = *pregs; + *pregs = link; + return 0; +} + +/** + * parse_options - Parse the command line options + * @argc: Number of options + * @argv: Array of options + */ +static int parse_options(int argc, char *argv[]) +{ + static const char optstr[] = "VhvnfpotXcmMOx:i:s:"; +#ifdef HAVE_GETOPT_LONG + static const struct option long_options[] = { + {"version", no_argument, NULL, 'V'}, + {"help", no_argument, NULL, 'h'}, + {"verbose", no_argument, NULL, 'v'}, + {"dry-run", no_argument, NULL, 'n'}, + {"respect-name", no_argument, NULL, 'f'}, + {"ignore-mode", no_argument, NULL, 'p'}, + {"ignore-owner", no_argument, NULL, 'o'}, + {"ignore-time", no_argument, NULL, 't'}, + {"respect-xattrs", no_argument, NULL, 'X'}, + {"maximize", no_argument, NULL, 'm'}, + {"minimize", no_argument, NULL, 'M'}, + {"keep-oldest", no_argument, NULL, 'O'}, + {"exclude", required_argument, NULL, 'x'}, + {"include", required_argument, NULL, 'i'}, + {"minimum-size", required_argument, NULL, 's'}, + {NULL, 0, NULL, 0} + }; +#endif + + int opt; + char unit = '\0'; + + opts.respect_mode = TRUE; + opts.respect_owner = TRUE; + opts.respect_time = TRUE; + opts.respect_xattrs = FALSE; + opts.keep_oldest = FALSE; + opts.min_size = 1; + + while ((opt = getopt_long(argc, argv, optstr, long_options, NULL)) != -1) { + switch (opt) { + case 'p': + opts.respect_mode = FALSE; + break; + case 'o': + opts.respect_owner = FALSE; + break; + case 't': + opts.respect_time = FALSE; + break; + case 'X': + opts.respect_xattrs = TRUE; + break; + case 'm': + opts.maximise = TRUE; + break; + case 'M': + opts.minimise = TRUE; + break; + case 'O': + opts.keep_oldest = TRUE; + break; + case 'f': + opts.respect_name = TRUE; + break; + case 'v': + opts.verbosity++; + break; + case 'c': + opts.respect_mode = FALSE; + opts.respect_name = FALSE; + opts.respect_owner = FALSE; + opts.respect_time = FALSE; + opts.respect_xattrs = FALSE; + break; + case 'n': + opts.dry_run = 1; + break; + case 'h': + return help(argv[0]); + 
case 'V': + return version(); + case 'x': + if (register_regex(&opts.exclude, optarg) != 0) + return 1; + break; + case 'i': + if (register_regex(&opts.include, optarg) != 0) + return 1; + break; + case 's': + if (sscanf(optarg, "%llu%c", &opts.min_size, &unit) < 1) { + jlog(JLOG_ERROR, "Invalid option given to -s: %s", optarg); + return 1; + } + switch (tolower(unit)) { + case '\0': + break; + case 't': + opts.min_size *= 1024; + case 'g': + opts.min_size *= 1024; + case 'm': + opts.min_size *= 1024; + case 'k': + opts.min_size *= 1024; + break; + default: + jlog(JLOG_ERROR, "Unknown unit indicator %c.", unit); + return 1; + } + jlog(JLOG_DEBUG1, "Using minimum size of %lld bytes.", + opts.min_size); + break; + case '?': + return 1; + default: + jlog(JLOG_ERROR, "Unexpected invalid option: -%c\n", opt); + return 1; + } + } + return 0; +} + +/** + * to_be_called_atexit - Cleanup handler, also prints statistics. + */ +static void to_be_called_atexit(void) +{ + if (stats.started) + print_stats(); +} + +/** + * sighandler - Signal handler, sets the global last_signal variable + * @i: The signal number + */ +static void sighandler(int i) +{ + if (last_signal != SIGINT) + last_signal = i; + if (i == SIGINT) + putchar('\n'); +} + +int main(int argc, char *argv[]) +{ + struct sigaction sa; + + sa.sa_handler = sighandler; + sa.sa_flags = SA_RESTART; + sigfillset(&sa.sa_mask); + + /* If we receive a SIGINT, end the processing */ + sigaction(SIGINT, &sa, NULL); + sigaction(SIGUSR1, &sa, NULL); + + /* Pretty print numeric output */ + setlocale(LC_NUMERIC, ""); + stats.start_time = gettime(); + + if (atexit(to_be_called_atexit) != 0) { + jlog(JLOG_SYSFAT, "Cannot register exit handler"); + return 1; + } + + if (parse_options(argc, argv) != 0) + return 1; + + if (optind == argc) { + jlog(JLOG_FATAL, "Expected file or directory names"); + return 1; + } + + stats.started = TRUE; + + for (; optind < argc; optind++) + if (nftw(argv[optind], inserter, 20, FTW_PHYS) == -1) + 
jlog(JLOG_SYSERR, "Cannot process %s", argv[optind]); + + twalk(files, visitor); + + return 0; }