//: C03:HTMLStripper.cpp {RunByHand}
//{L} ReplaceAll
// Filter to remove html tags and markers.
#include <cassert>
#include <cmath>
#include <cstddef>
#include <fstream>
#include <iostream>
#include <string>
#include "ReplaceAll.h"
#include "../require.h"
using namespace std;
// Strips HTML tags and a few character entities from one line of
// text. The string is modified in place and a reference to that
// same string is returned.
//
// s: one line of input.
//
// A tag may begin on one line and end on a later one, so the
// static flag inTag records, between calls, whether the previous
// line ended inside an unfinished tag. Because that flag is shared
// by every call, lines must be passed in document order and only
// one document can be processed at a time.
string& stripHTMLTags(string& s) {
  static bool inTag = false;
  bool done = false;
  while(!done) {
    if(inTag) {
      // A previous line started an HTML tag but didn't finish it,
      // so everything up to the first '>' is still markup.
      size_t rightPos = s.find('>');
      if(rightPos != string::npos) {
        inTag = false;
        s.erase(0, rightPos + 1);
      }
      else {
        // The whole line lies inside the unfinished tag.
        done = true;
        s.erase();
      }
    }
    else {
      // Look for start of tag:
      size_t leftPos = s.find('<');
      if(leftPos != string::npos) {
        // See if the tag close is in this line. The search starts
        // at the '<': a stray '>' earlier in the text would
        // otherwise give rightPos < leftPos, and the unsigned
        // length below would wrap around and erase the rest of
        // the line.
        size_t rightPos = s.find('>', leftPos);
        if(rightPos == string::npos) {
          // Tag continues on the next line; drop its first part.
          inTag = done = true;
          s.erase(leftPos);
        }
        else
          s.erase(leftPos, rightPos - leftPos + 1);
      }
      else
        done = true;
    }
  }
  // Replace special HTML character entities with the characters
  // they stand for. "&amp;" is handled last so that the '&' it
  // produces cannot join the following text to form another
  // entity that a later pass would decode a second time.
  replaceAll(s, "&lt;", "<");
  replaceAll(s, "&gt;", ">");
  replaceAll(s, "&nbsp;", " ");
  replaceAll(s, "&amp;", "&");
  // Etc...
  return s;
}
int main(int argc, char* argv[]) {
requireArgs(argc, 1,
"usage: HTMLStripper InputFile");
ifstream in(argv[1]);
assure(in, argv[1]);
string s;
while(getline(in, s))
if(!stripHTMLTags(s).empty())
cout << s << endl;
} ///:~
This example will even strip HTML tags that span multiple
lines. This is
accomplished with the static flag, inTag, which is true whenever
the start of a tag is found, but the accompanying tag end is not found in the
same line. All forms of erase( ) appear in the stripHTMLTags( )
function. The
version of getline( ) we use here is a (global) function declared
in the <string> header and is handy because it stores an
arbitrarily long line in its string argument. You don't need to worry
about the dimension of a character array as you do with istream::getline( ).
Notice that this program uses the replaceAll( ) function from
earlier in this chapter. In the next chapter, we'll use string streams to
create a more elegant solution.