// Program for extracting the text from HTML files
//---------------------------------------------------------------------------
#include <vcl.h>
#pragma hdrstop

#include "html1.h"
#include "Fortschritt.h"
//---------------------------------------------------------------------------
#pragma package(smart_init)
#pragma resource "*.dfm"
// Global pointer to the main form. It is not assigned anywhere in this unit
// (presumably set when the application creates the form - confirm in the
// project source); ZeigeFortschritt reads it to position the progress window.
TForm1 *Form1;
//---------------------------------------------------------------------------
// Constructs the main form. Owner is forwarded unchanged to the TForm base
// class; the constructor body itself does nothing.
__fastcall TForm1::TForm1(TComponent* Owner)
    : TForm(Owner)
{
}
//---------------------------------------------------------------------------
// Form creation handler: maximizes the window, points the save dialog at the
// directory currently shown in the directory list box and fills the file list
// once by calling SucheFiles.
void __fastcall TForm1::FormCreate(TObject *Sender)
{
  WindowState = wsMaximized;

  // Installation of the application-wide exception handler is disabled:
  //  Application->OnException = HandleError;

  SaveDialog->InitialDir = DirectoryListBox->Directory;

  SucheFiles(Sender);
}
//---------------------------------------------------------------------------
// Rebuilds the file list (ListBox1) for the directory selected in
// DirectoryListBox. Switches to the first tab, clears both result lists, shows
// an hourglass cursor during the search and enables the save button only when
// at least one file was found.
void __fastcall TForm1::SucheFiles(TObject *Sender)
{
  PageControl1->ActivePage = TabSheet1;
  SpeichernBtn->Enabled = false;
  Cursor = crHourGlass;

  ListBox1->Items->Clear();
  ListBox2->Items->Clear();

  // FindFilesInDirectory expects a path it can append a file mask to.
  // Paths of up to three characters are left untouched (presumably drive
  // roots such as "C:\" that already end in a backslash - confirm).
  String searchRoot = DirectoryListBox->Directory;
  if (searchRoot.Length() > 3)
  {
    searchRoot = searchRoot + "\\";
  }
  FindFilesInDirectory(searchRoot);

  if (ListBox1->Items->Count != 0)
  {
    SpeichernBtn->Enabled = true;
  }
  Cursor = crDefault;
}
//---------------------------------------------------------------------------
// Adds every file in `directory` whose name matches the mask in Edit->Text to
// ListBox1 and, when CheckBox is checked, descends recursively into all
// subdirectories.
//
// directory: path to search; it must end with a backslash because the file
//            mask and the found names are appended to it directly.
//
// Directory entries (including "." and "..") that happen to match the mask are
// not added to the list: the list is later fed to LoadFromFile, which can only
// open regular files.
void TForm1::FindFilesInDirectory (const String &directory)
{
  WIN32_FIND_DATA filedata ;  // Structure for file data
  HANDLE filehandle ;         // Handle for searching

  // Pass 1 - collect the files in this directory that match the mask
  filehandle = FindFirstFile ((directory + Edit->Text).c_str (), &filedata) ;
  if (filehandle != INVALID_HANDLE_VALUE)
  {
    do
    {
      // Skip directories; only regular files belong in the list.
      if ((filedata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) == 0)
        ListBox1->Items->Add (directory + filedata.cFileName) ;
    } while (FindNextFile (filehandle, &filedata)) ;
    FindClose (filehandle) ;
  }
  if(!CheckBox->Checked)return;

  // Pass 2 - Search for all the subdirectories within this directory
  filehandle = FindFirstFile ((directory + "*.*").c_str (), &filedata) ;
  if (filehandle != INVALID_HANDLE_VALUE)
  {
    do
    {
      // "." and ".." refer to this directory and its parent; following them
      // would recurse forever.
      if ((filedata.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) != 0
           && String (filedata.cFileName) != "."
           && String (filedata.cFileName) != "..")
      {
        // Recursive call here
        FindFilesInDirectory (directory + filedata.cFileName + "\\") ;
      }

      // Keep the user interface responsive during long searches.
      Application->ProcessMessages () ;
    } while (FindNextFile (filehandle, &filedata)) ;
    FindClose (filehandle) ;
  }

  Application->ProcessMessages () ;
}
//---------------------------------------------------------------------------
// Save button handler: asks for a file name and writes the list shown on the
// active tab (ListBox1 on TabSheet1, otherwise ListBox2) to that file.
// The button is disabled while the file is written and is re-enabled even if
// writing fails, so a failed save does not leave the button unusable.
void __fastcall TForm1::SpeichernBtnClick(TObject *Sender)
{
  SaveDialog->Title = "Save As";
  if(!SaveDialog->Execute())return;   // user cancelled the dialog

  SpeichernBtn->Enabled=false;
  try
  {
    if(PageControl1->ActivePage==TabSheet1)
      ListBox1->Items->SaveToFile(SaveDialog->FileName);
    else
      ListBox2->Items->SaveToFile(SaveDialog->FileName);
  }
  catch(...)
  {
    // Restore the button before passing the error on to the caller.
    SpeichernBtn->Enabled=true;
    throw;
  }
  SpeichernBtn->Enabled=true;
}
//---------------------------------------------------------------------------
void __fastcall TForm1::TextExClick(TObject *Sender)
{
BOOL Leerzeichen,Paragraph,Script;
const int maxEnts=100,minEntCount=2,maxEntCount=6;
int i,k,m,n,p,LineCount=0,DateiCount=0,KlammerAuf,iByte;
String Str,EntStr,TxtStr;
String EntList[maxEnts]={"lt","gt","amp","yen","uml","not","shy","reg","deg","ETH","eth",
      "auml","ouml","uuml","Auml","Ouml","Uuml","quot","nbsp","cent","sect","copy",
      "ordf","macr","sup2","sup3","para","sup1","ordm","Euml","Iuml","euml","iuml",
      "yuml","szlig","iexcl","pound","laquo","acute","micro","cedil","raquo",
      "Acirc","Aring","AElig","Ecirc","Icirc","Ocirc","times","Ucirc",
      "THORN","acirc","aring","aelig","ecirc","icirc","ocirc","ucirc","thorn",
      "curren","brvbar","plusmn","middot","frac14","frac12","frac34","iquest","Agrave",
      "Aacute","Atilde","Ccedil","Egrave","Eacute","Igrave","Iacute","Ntilde",
      "Ograve","Oacute","Otilde","Oslash","Ugrave","Uacute","Yacute","agrave",
      "aacute","atilde","ccedil","egrave","eacute","igrave","iacute","ntilde",
      "ograve","oacute","otilde","divide","oslash","ugrave","uacute","yacute"};
BYTE UmlList[maxEnts]={60,62,38,165,168,172,173,174,176,208,240,
      228,246,252,196,214,220,34,160,162,167,169,
      170,175,178,179,182,185,186,203,207,235,239,255,
      223,161,163,171,180,181,184,187,
      194,197,198,202,206,212,215,219,
      222,226,229,230,234,238,244,251,254,
      164,166,177,183,188,189,190,191,192,
      193,195,199,200,201,204,205,209,
      210,211,213,216,217,218,221,224,
      225,227,231,232,233,236,237,241,
      242,243,245,247,248,249,250,253};
int EntStart[]={0,2,11,34,59,maxEnts};
char b,c;
  ListBox2->Visible=false;
  ListBox2->Clear();
  PageControl1->ActivePage=TabSheet2;
  SpeichernBtn->Enabled=false;
  ZeigeFortschritt("Text Extrahieren");
  for(i=0; i<ListBox1->Items->Count; i++)
  {
    if(ListBox1->SelCount && (DateiCount>=ListBox1->SelCount))break;
    if(ListBox1->SelCount && !ListBox1->Selected[i])continue;
    DateiCount++;
    Str= ListBox1->Items->Strings[i];
    ListBox3->Clear();
    ListBox3->Items->LoadFromFile(Str);
    KlammerAuf=0;
    TxtStr="//------Datei: "; TxtStr=TxtStr+Str;
    int StrLen=TxtStr.Length();
    if(StrLen<79)TxtStr=TxtStr+AnsiString::StringOfChar('-',79-StrLen);
    EntStr="";
    Paragraph=false;
    for(k=0; k<ListBox3->Items->Count; k++)
    {
      Application->ProcessMessages () ;
      if(TxtStr.Length())
      {
        TxtStr=TxtStr+EntStr;
        if(!Paragraph)
        {
          ListBox2->Items->Append(TxtStr+EntStr);
          TxtStr="";LineCount++;
        }
        else TxtStr=TxtStr+' ';
      }
      if((LineCount & 7)==0)InkrementFortschritt();
      EntStr="";
      Leerzeichen=false;
      Str=ListBox3->Items->Strings[k];
      if(Str.Pos("<SCRIPT")){Script=true; continue;}
      if(Str.Pos("</SCRIPT")){Script=false; ShowMessage("Script");continue;}
      for(m=1; m<= Str.Length(); m++)
      {
        b=Str[m];
        if(KlammerAuf)
        {
//          if((b=='>')&& !Script){ KlammerAuf=0; continue; }
          if(b=='>'){ KlammerAuf=0; continue; }
        }
//        if((b=='<') && !Script)
        if(b=='<')
        {
          KlammerAuf++;
          if(FliessText->Checked)
          {
            b=Str[m+1];
            if((b=='p')||(b=='P'))Paragraph=true;
            else if(b=='/')
            {
              b=Str[m+2];
              if((b=='p')||(b=='P'))Paragraph=false;
            }
          }
          continue;
        }
        if(b=='&')
        {
          b=0;
          if(Str[m+1]=='#')
          {
            for(n=m+2; n<=Str.Length(); n++)
            {
              if(n>m+4)break;
              c=Str[n];
              if(c>='0' && c<='9')EntStr=EntStr+c;
              else break;
            }
            iByte=EntStr.ToIntDef(0);
            if(iByte>=32)
            {
              b=(char)iByte;
              if(Str[n]==';')m++;
            }
            else{TxtStr=TxtStr+"&#";TxtStr=TxtStr+EntStr;}
            m+=1+EntStr.Length(); EntStr="";
          }
          else
          {
            iByte=0;
            for(n=m+1; n<=Str.Length(); n++)
            {
              EntStr=EntStr+Str[n];
              if(n>=m+minEntCount)
              {
                int nStart=EntStart[EntStr.Length()-2];
                int nStop= EntStart[EntStr.Length()-1];
                for(p=nStart; p<nStop; p++)
                  if(EntStr== EntList[p])
                  {
                    b=UmlList[p];
                    iByte++;break;
                  }
              }
              if(iByte || (n>=m+1+maxEntCount))break;
            }
            if(!iByte){TxtStr=TxtStr+'&';TxtStr=TxtStr+EntStr;}
            m=n; if(Str[m+1]==';')m++;EntStr="";
          }
        }
        if((Byte)b==160)b=' '; //nbsp
        if(b==' ')
        {
          if(!TxtStr.Length()||Leerzeichen)continue;
          else{TxtStr=TxtStr+b; Leerzeichen=true;}
        }
        else if((Byte)b>' ')
        {
          TxtStr=TxtStr+b;
          Leerzeichen=false;
        }
      }
    }
  }
  SpeichernBtn->Enabled=true;
  Form2->Close();
  /*
  QueryPerformanceCounter(&LI);
  Stop=LI.u.LowPart;
  Diff=(Stop-Start)*1000/Freq;
  ltoa(Diff,LI_Str,10);
  ShowMessage(LI_Str);
  */
  ListBox2->Visible=true;
}
//---------------------------------------------------------------------------
// Shows the progress window (Form2) centred over the main form, with its
// progress bar reset to the minimum and Titel as the window caption.
void TForm1::ZeigeFortschritt(const String Titel)
{
  // Horizontal placement: centre of Form1 minus half the width of Form2.
  const int centreX = Form1->Left + Form1->Width / 2;
  Form2->Left = centreX - Form2->Width / 2;

  // Vertical placement, computed the same way.
  const int centreY = Form1->Top + Form1->Height / 2;
  Form2->Top = centreY - Form2->Height / 2;

  Form2->ProgressBar1->Position = Form2->ProgressBar1->Min;
  Form2->Caption = Titel;
  Form2->Show();
}
//---------------------------------------------------------------------------
// Advances the progress bar of Form2 by one step; once the bar reaches its
// maximum it starts again from the minimum.
void TForm1::InkrementFortschritt(void)
{
  Form2->ProgressBar1->Position = Form2->ProgressBar1->Position + 1;

  const bool atEnd = (Form2->ProgressBar1->Position == Form2->ProgressBar1->Max);
  if (atEnd)
  {
    Form2->ProgressBar1->Position = Form2->ProgressBar1->Min;
  }
}
//---------------------------------------------------------------------------
// Exception handler that shows the fixed message "Laufwerk-Fehler"
// ("drive error"), regardless of which exception was raised; Sender and EE
// are not used. The only assignment of this handler visible in this unit
// (in FormCreate) is commented out.
void __fastcall TForm1::HandleError(TObject *Sender, Exception *EE)
{
  ShowMessage("Laufwerk-Fehler");
}
//---------------------------------------------------------------------------
// Form destruction handler: clears the application-wide exception handler
// (presumably so that it cannot refer to a method of the destroyed form -
// confirm whether a handler is installed anywhere).
void __fastcall TForm1::FormDestroy(TObject *Sender)
{
  Application->OnException = NULL;
}

//---------------------------------------------------------------------------

