aboutsummaryrefslogtreecommitdiff
path: root/usr.bin
diff options
context:
space:
mode:
author Tim J. Robbins <tjr@FreeBSD.org> 2002-06-13 12:48:50 +0000
committer Tim J. Robbins <tjr@FreeBSD.org> 2002-06-13 12:48:50 +0000
commit ebb42aee311aba4f36f4bf0fa5f80f358beed2c1 (patch)
tree 64a9e16ffbefc587157bf0be0541b1a4ebcf98c5 /usr.bin
parent a446b510a438437a621611ecf7b697f71d74e1b8 (diff)
download src-ebb42aee311aba4f36f4bf0fa5f80f358beed2c1.tar.gz
download src-ebb42aee311aba4f36f4bf0fa5f80f358beed2c1.zip
Add the -m option, which counts characters (as opposed to -c, which
counts bytes). In locales that don't have multibyte characters, -m is effectively an alias for -c. This brings wc(1) up to P1003.1-2001 conformance.
Notes
Notes: svn path=/head/; revision=98165
Diffstat (limited to 'usr.bin')
-rw-r--r-- usr.bin/wc/wc.1 40
-rw-r--r-- usr.bin/wc/wc.c 67
2 files changed, 83 insertions, 24 deletions
diff --git a/usr.bin/wc/wc.1 b/usr.bin/wc/wc.1
index fc535e2693a0..92abd0f03ad4 100644
--- a/usr.bin/wc/wc.1
+++ b/usr.bin/wc/wc.1
@@ -40,10 +40,10 @@
.Os
.Sh NAME
.Nm wc
-.Nd word, line, and byte count
+.Nd word, line, character, and byte count
.Sh SYNOPSIS
.Nm
-.Op Fl clw
+.Op Fl clmw
.Op Ar
.Sh DESCRIPTION
The
@@ -71,6 +71,12 @@ is written to the standard output.
.It Fl l
The number of lines in each input file
is written to the standard output.
+.It Fl m
+The number of characters in each input file is written to the standard output.
+If the current locale does not support multibyte characters, this
+is equivalent to the
+.Fl c
+option.
.It Fl w
The number of words in each input file
is written to the standard output.
@@ -79,10 +85,36 @@ is written to the standard output.
When an option is specified,
.Nm
only reports the information requested by that option.
-The default action is equivalent to specifying all of the flags.
+The default action is equivalent to specifying the
+.Fl c ,
+.Fl l
+and
+.Fl w
+options.
.Pp
If no files are specified, the standard input is used and no
file name is displayed.
+.Sh ENVIRONMENT
+The
+.Ev LANG ,
+.Ev LC_ALL
+and
+.Ev LC_CTYPE
+environment variables affect the execution of
+.Nm
+as described in
+.Xr environ 7
+when the
+.Fl m
+option is specified.
+.Sh EXAMPLES
+Count the number of characters, words and lines in each of the files
+.Pa report1
+and
+.Pa report2
+as well as the totals for both:
+.Pp
+.Dl "wc -mlw report1 report2"
.Sh DIAGNOSTICS
.Ex -std
.Sh SEE ALSO
@@ -108,7 +140,7 @@ function, as required by
The
.Nm
function conforms to
-.St -p1003.2 .
+.St -p1003.1-2001 .
.Sh HISTORY
A
.Nm
diff --git a/usr.bin/wc/wc.c b/usr.bin/wc/wc.c
index d70d94093920..cacc08e03ef6 100644
--- a/usr.bin/wc/wc.c
+++ b/usr.bin/wc/wc.c
@@ -51,6 +51,7 @@ __FBSDID("$FreeBSD$");
#include <ctype.h>
#include <err.h>
+#include <errno.h>
#include <fcntl.h>
#include <locale.h>
#include <stdint.h>
@@ -60,7 +61,7 @@ __FBSDID("$FreeBSD$");
#include <unistd.h>
uintmax_t tlinect, twordct, tcharct;
-int doline, doword, dochar;
+int doline, doword, dochar, domulti;
static int cnt(const char *);
static void usage(void);
@@ -74,7 +75,7 @@ main(argc, argv)
(void) setlocale(LC_CTYPE, "");
- while ((ch = getopt(argc, argv, "lwc")) != -1)
+ while ((ch = getopt(argc, argv, "clmw")) != -1)
switch((char)ch) {
case 'l':
doline = 1;
@@ -84,6 +85,11 @@ main(argc, argv)
break;
case 'c':
dochar = 1;
+ domulti = 0;
+ break;
+ case 'm':
+ domulti = 1;
+ dochar = 0;
break;
case '?':
default:
@@ -93,7 +99,7 @@ main(argc, argv)
argc -= optind;
/* Wc's flags are on by default. */
- if (doline + doword + dochar == 0)
+ if (doline + doword + dochar + domulti == 0)
doline = doword = dochar = 1;
errors = 0;
@@ -117,7 +123,7 @@ main(argc, argv)
(void)printf(" %7ju", tlinect);
if (doword)
(void)printf(" %7ju", twordct);
- if (dochar)
+ if (dochar || domulti)
(void)printf(" %7ju", tcharct);
(void)printf(" total\n");
}
@@ -130,10 +136,12 @@ cnt(file)
{
struct stat sb;
uintmax_t linect, wordct, charct;
- int fd, len;
+ ssize_t nread;
+ int clen, fd, len, warned;
short gotsp;
u_char *p;
u_char buf[MAXBSIZE], ch;
+ wchar_t wch;
linect = wordct = charct = 0;
if (file == NULL) {
@@ -144,7 +152,7 @@ cnt(file)
warn("%s: open", file);
return (1);
}
- if (doword)
+ if (doword || (domulti && MB_CUR_MAX != 1))
goto word;
/*
* Line counting is split out because it's a lot faster to get
@@ -176,7 +184,7 @@ cnt(file)
* If all we need is the number of characters and it's a
* regular or linked file, just stat the puppy.
*/
- if (dochar) {
+ if (dochar || domulti) {
if (fstat(fd, &sb)) {
warn("%s: fstat", file);
(void)close(fd);
@@ -192,22 +200,41 @@ cnt(file)
}
/* Do it the hard way... */
-word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
- if (len == -1) {
+word: gotsp = 1;
+ len = 0;
+ warned = 0;
+ while ((nread = read(fd, buf + len, MAXBSIZE - len)) != 0) {
+ if (nread == -1) {
warn("%s: read", file);
(void)close(fd);
return (1);
}
- /*
- * This loses in the presence of multi-byte characters.
- * To do it right would require a function to return a
- * character while knowing how many bytes it consumed.
- */
- charct += len;
- for (p = buf; len--;) {
- ch = *p++;
- if (ch == '\n')
+ len += nread;
+ p = buf;
+ while (len > 0) {
+ if (!domulti || MB_CUR_MAX == 1) {
+ clen = 1;
+ wch = (unsigned char)*p;
+ } else if ((clen = mbtowc(&wch, p, len)) <= 0) {
+ if (len > MB_CUR_MAX) {
+ clen = 1;
+ wch = (unsigned char)*p;
+ if (!warned) {
+ errno = EILSEQ;
+ warn("%s", file);
+ warned = 1;
+ }
+ } else {
+ memmove(buf, p, len);
+ break;
+ }
+ }
+ charct++;
+ len -= clen;
+ p += clen;
+ if (wch == L'\n')
++linect;
+ /* XXX Non-portable; should use iswspace() */
if (isspace(ch))
gotsp = 1;
else if (gotsp) {
@@ -224,7 +251,7 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
twordct += wordct;
(void)printf(" %7ju", wordct);
}
- if (dochar) {
+ if (dochar || domulti) {
tcharct += charct;
(void)printf(" %7ju", charct);
}
@@ -235,6 +262,6 @@ word: for (gotsp = 1; (len = read(fd, buf, MAXBSIZE));) {
static void
usage()
{
- (void)fprintf(stderr, "usage: wc [-clw] [file ...]\n");
+ (void)fprintf(stderr, "usage: wc [-clmw] [file ...]\n");
exit(1);
}