For me, the regex solution was not really comfortable enough. As I usually start cleaning from pure text, I thought I'd do something really nice and wrote this small piece:
Code:
/*
stripLF.c - remove false line breaks
compile with: cc stripLF.c -o stripLF
usage:
stripLF -h
stripLF [-C][-p][-H] < file.in > file.out
cat file.in | stripLF [-C][-p][-H] >file.out
options:
-h: print help text and exit
-C: line break before [C]apitals is legitimate too (for poetry)
-p: change line breaks into </p> LF <p>
-H: change text into bare html page
removes all carriage returns
removes all line breaks which are not
preceded by . _ ! ? * ' " ] > or another line break, or (with option -C) followed by a capital letter
removes multiple spaces
(c)varlog 2013
LICENSE: FREE FOR ALL
*/
#include <stdio.h>
/* ASCII line feed: the line-break character this tool keeps or removes. */
#define LF 0x0A
/* ASCII carriage return: dropped from the output. */
#define CR 0x0D
/* ASCII space: runs of these are collapsed to a single one. */
#define SPACE 0x20
/* ASCII apostrophe ('): a line break right after it is kept. */
#define SINGLE_QUOTE 0x27
/* ASCII double quote ("): a line break right after it is kept. */
#define DOPPEL_QUOTE 0x22
/* Program version; printed by usage() with two decimals. */
#define VERSION 1.02
/*
 * usage - write the stripLF help banner to standard output.
 *
 * The fixed part of the banner is kept as one concatenated string literal;
 * only the version line needs formatting (VERSION is printed with two
 * decimals). The output is framed by two rows of asterisks.
 */
void usage(){
    static const char banner[] =
        "\n**********************************************************\n"
        "stripLF: remove false line breaks \n"
        "usage: \n"
        "stripLF [-h] \n"
        "stripLF [-C] [-p] [-H] < file.in > file.out \n"
        "cat file.in |stripLF [-C] [-p] [-H] > file.out \n"
        "options:\n"
        "-h: print this help text and exit\n"
        "-C: line break before [C]apitals is legitimate too\n"
        "-p: change line breaks into </p> LF <p>\n"
        "-H: add <html><body>......</body></html> tokens, implies -p \n";

    printf("%s", banner);
    printf("v %.2f 2013 (c) varlog\n", VERSION);
    printf("%s", "***********************************************************\n");
}
/*
 * keeps_break_after - nonzero when a line break that directly follows the
 * character `prev` is treated as a real one: sentence/markup terminators,
 * quotes, or a preceding line break (i.e. an empty line).
 */
static int keeps_break_after(int prev)
{
    return prev == ']' || prev == '>' || prev == '*' || prev == '_' ||
           prev == '.' || prev == '!' || prev == '?' ||
           prev == SINGLE_QUOTE || prev == DOPPEL_QUOTE || prev == LF;
}

/*
 * main - filter standard input to standard output, removing false line
 * breaks, carriage returns and repeated spaces.
 *
 * Options (each given as its own argument):
 *   -h  print the help text and exit immediately
 *   -C  a line break followed by a character in '@'..'Z' is kept too
 *   -p  every kept line break is written as </p> LF <p>
 *   -H  wrap the output in <html><body>...</body></html>; implies -p
 * Unknown options and non-option arguments are ignored.
 *
 * Returns 0.
 */
int main(int argc, char **argv)
{
    int ch;
    int pch = LF;   /* last character that counts as "previous"; LF at start */
    int nch = 0;    /* first significant character after a line break */
    int i;
    int Cflag = 0;
    int Hflag = 0;
    int pflag = 0;

    for (i = 1; i < argc; i++) {
        if (argv[i][0] != '-') {
            continue;
        }
        switch (argv[i][1]) {
        case 'C': /* capitals */
            Cflag = 1;
            break;
        case 'p': /* LF --> </p><p> */
            pflag = 1;
            break;
        case 'h': /* help: print and leave without emitting any wrapper tags */
            usage();
            return 0;
        case 'H': /* --> html */
            Hflag = 1;
            pflag = 1;
            break;
        default:
            break;
        }
    }

    if (Hflag) {
        printf("<html>\n<body>\n");
    }
    if (pflag) {
        printf("<p>");
    }

    while ((ch = getchar()) != EOF) {
        if (ch == CR) {
            continue;   /* drop CR; it must not become the previous char */
        }
        if (ch == SPACE && pch == SPACE) {
            continue;   /* collapse runs of spaces */
        }
        if (ch != LF) {
            putchar(ch);
            pch = ch;
            continue;
        }

        /*
         * Line break: look ahead past indentation. CR is skipped as well so
         * that CRLF input neither leaks a CR into the output nor hides the
         * empty line between two paragraphs.
         */
        do {
            nch = getchar();
        } while (nch == SPACE || nch == CR);

        if (nch == EOF) {
            putchar(LF);    /* keep the final line break of the input */
            break;
        }

        if (keeps_break_after(pch) ||
            nch == LF ||
            (Cflag == 1 && nch >= 0x40 && nch <= 0x5A)) { /* capitals and @ */
            /* real line break */
            if (pflag) {
                printf("</p>");
            }
            putchar(LF);
            if (pflag) {
                printf("<p>");
            }
        } else if (pch != SPACE) {
            /* phony line break: join the lines with exactly one space */
            putchar(SPACE);
        }
        putchar(nch);
        pch = nch;
    }

    if (pflag) {
        printf("</p>");
    }
    if (Hflag) {
        printf("\n</body>\n</html>");
    }
    return 0;
}
Of course it is not really nice, and the result still needs manual correction in the end
- but it works for me
.