00001
00002
00003
00004
00005
00006 #include "stdafx.h"
00007 #include "FdmApp.h"
00008 #include "fsWebPageDownloader.h"
00009 #include "DownloadsWnd.h"
00010 #include "inetutil.h"
00011 #include "ShedulerWnd.h"
00012
00013 extern CShedulerWnd *_pwndScheduler;
00014
00015 #ifdef _DEBUG
00016 #undef THIS_FILE
00017 static char THIS_FILE[]=__FILE__;
00018 #define new DEBUG_NEW
00019 #endif
00020
00021 extern CDownloadsWnd *_pwndDownloads;
00022
00023 fsWebPageDownloader::fsWebPageDownloader()
00024 {
00025 m_pfnEvents = NULL;
00026 ReadDefaultWPDS (&m_wpds);
00027 InitializeCriticalSection (&m_cs_Done_Redir_Events);
00028 m_nMaxID = 0;
00029 m_bStopped = FALSE;
00030 m_bIsDeleting = FALSE;
00031 m_bWasShutdownMsg = FALSE;
00032 }
00033
00034 fsWebPageDownloader::~fsWebPageDownloader()
00035 {
00036 for (size_t i = 0; i < m_vConfs.size (); i++)
00037 {
00038 SAFE_DELETE (m_vConfs [i].wp->pvUnpLinks);
00039 SAFE_DELETE (m_vConfs [i].wp->pvUrls);
00040 }
00041
00042 for (i = 0; i < (size_t)m_wpds.vIgnoreList.size (); i++)
00043 delete m_wpds.vIgnoreList [i];
00044
00045 DeleteCriticalSection (&m_cs_Done_Redir_Events);
00046 }
00047
00048 BOOL fsWebPageDownloader::Create(LPCSTR pszPageURL, BOOL bAutoStart, fsSchedule *task)
00049 {
00050 if (AddPage (NULL, pszPageURL, WPDPT_PAGE, TRUE, bAutoStart, task))
00051 {
00052 _Snds.Event (SME_DOWNLOADADDED);
00053 return TRUE;
00054 }
00055
00056 return FALSE;
00057 }
00058
00059 fsDLWebPage* fsWebPageDownloader::AddPage(fs::ListTree <fsDLWebPage> *root, LPCSTR pszPageURL, vmsWPDPageType enPageType, BOOL bSetCTReq, BOOL bAutoStart, fsSchedule *task)
00060 {
00061 fsDLWebPage wp;
00062 vmsDownloadSmartPtr dld;
00063 Download_CreateInstance (dld);
00064
00065 if (m_bIsDeleting)
00066 return NULL;
00067
00068 if (enPageType != WPDPT_PAGE)
00069 bSetCTReq = FALSE;
00070
00071 LPSTR pszWA;
00072
00073 if (fsIsAnchorInUrl (pszPageURL, &pszWA))
00074 {
00075
00076 wp.strURL = pszWA;
00077 delete [] pszWA;
00078 }
00079 else
00080 wp.strURL = pszPageURL;
00081
00082 if (bSetCTReq)
00083 dld->dwFlags |= DLD_CTREQ_HTML;
00084 else if (enPageType == WPDPT_FILE)
00085 dld->dwFlags |= DLD_NOTIFYBEFOREDOWNLOAD;
00086
00087
00088 if (IR_SUCCESS != dld->pMgr->GetDownloadMgr ()->CreateByUrl (wp.strURL, TRUE))
00089 return NULL;
00090
00091
00092 fsDownload_NetworkProperties *dnp = dld->pMgr->GetDownloadMgr ()->GetDNP ();
00093 if (*dnp->pszUserName == 0 && m_wpds.strUserName.Length ())
00094 {
00095 SAFE_DELETE_ARRAY (dnp->pszUserName);
00096 SAFE_DELETE_ARRAY (dnp->pszPassword);
00097
00098 fsnew (dnp->pszUserName, char, m_wpds.strUserName.Length () + 1);
00099 fsnew (dnp->pszPassword, char, m_wpds.strPassword.Length () + 1);
00100 strcpy (dnp->pszUserName, m_wpds.strUserName);
00101 strcpy (dnp->pszPassword, m_wpds.strPassword);
00102 }
00103
00104 if (root == NULL)
00105 m_strStartServer = dld->pMgr->GetDownloadMgr ()->GetDNP ()->pszServerName;
00106
00107 char szAddPath [10000];
00108 *szAddPath = 0;
00109
00110 if ((m_wpds.dwFlags & WPDF_KEEPFOLDERSTRUCTURE) && *dnp->pszPathName)
00111 {
00112 char szFilePath [10000];
00113 char szPath [10000];
00114
00115 fsFilePathFromUrlPath (dnp->pszPathName+1, dnp->enProtocol == NP_FTP,
00116 TRUE, szFilePath, sizeof (szFilePath));
00117
00118 fsGetPath (szFilePath, szPath);
00119 fsPathToGoodPath (szPath);
00120
00121 if (strcmp (szPath, "\\") == 0)
00122 *szPath = 0;
00123
00124 lstrcpy (szAddPath, m_strStartServer);
00125 lstrcat (szAddPath, "\\");
00126 lstrcat (szAddPath, szPath);
00127 }
00128
00129 fsnew (dld->pMgr->GetDownloadMgr ()->GetDP ()->pszFileName, char, m_wpds.strFolderSaveTo.Length () + strlen (szAddPath) + 1);
00130 LPSTR pszFile = dld->pMgr->GetDownloadMgr ()->GetDP ()->pszFileName;
00131 lstrcpy (pszFile, m_wpds.strFolderSaveTo);
00132 lstrcat (pszFile, szAddPath);
00133
00134 ApplySettingsToDld (dld);
00135 dld->bAutoStart = m_bStopped ? FALSE : bAutoStart;
00136 if (root != NULL)
00137 dld->dwFlags |= DLD_DONTPUTTOHISTORY;
00138 dld->dwFlags |= DLD_NOTIFICATIONS_LL;
00139
00140 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
00141 {
00142
00143 if (FALSE == dld->pMgr->GetDownloadMgr ()->InitFile (TRUE, enPageType == WPDPT_PAGE && m_wpds.bSavePagesUnderHTM ? "htm" : NULL))
00144 return NULL;
00145 }
00146
00147 wp.bState = 0;
00148 wp.nID = ++m_nMaxID;
00149 fsnew1 (wp.pvUnpLinks, fs::list <_WP_UnprocessedLinks>);
00150 fsnew1 (wp.pvUrls, fs::list <fsString>);
00151
00152 wp.dld = dld;
00153
00154 return AddWebPage (&wp, root, task);
00155 }
00156
00157 void fsWebPageDownloader::ApplySettingsToDld(vmsDownloadSmartPtr dld)
00158 {
00159 ASSERT (dld->pMgr->GetDownloadMgr () != NULL);
00160
00161 dld->pGroup = m_wpds.pDLGroup;
00162 dld->bAutoStart = m_bStopped == FALSE;
00163
00164 *dld->pMgr->GetDownloadMgr ()->GetDP ()->pszAdditionalExt = 0;
00165
00166 dld->pMgr->GetDownloadMgr ()->GetDP ()->enAER = AER_RENAME;
00167 if (m_wpds.dwFlags & WPDF_DELCOMPLETEDDLDS)
00168 dld->dwFlags |= DLD_DELETEWHENDONE;
00169 dld->dwFlags |= DLD_NOAUTOLAUNCH | DLD_DONTSHOWDIALOG;
00170 dld->pMgr->GetDownloadMgr ()->GetDP ()->dwFlags &= ~DPF_GENERATEDESCFILE;
00171 dld->dwFlags |= DLD_USEDBYHTMLSPIDER;
00172 dld->pMgr->GetDownloadMgr ()->GetDP ()->bCheckIntegrityWhenDone = FALSE;
00173 }
00174
00175 DWORD fsWebPageDownloader::_DldEvents(fsDownload* dld, enum fsDownloadsMgrEvent ev, LPVOID lp)
00176 {
00177 fsWebPageDownloader* pThis = (fsWebPageDownloader*) lp;
00178
00179 try {
00180
00181 switch (ev)
00182 {
00183 case DME_DOWNLOADSTOPPEDORDONE:
00184 if (pThis->m_bWasShutdownMsg == FALSE)
00185 {
00186 if (pThis->IsDone ())
00187 {
00188 pThis->m_bWasShutdownMsg = TRUE;
00189 pThis->Event (WPDE_DONE);
00190 }
00191 else if (pThis->IsRunning () == FALSE)
00192 {
00193 pThis->m_bWasShutdownMsg = TRUE;
00194 pThis->Event (WPDE_STOPPED);
00195 }
00196 }
00197 break;
00198
00199 case DME_DOWNLOADEREVENTRECEIVED:
00200 if (dld->pMgr->IsRunning () == FALSE && dld->pMgr->IsDone ())
00201 {
00202
00203 EnterCriticalSection (&pThis->m_cs_Done_Redir_Events);
00204
00205 try {
00206 pThis->OnWPDownloadDone (dld);
00207 }
00208 catch (...) {}
00209 LeaveCriticalSection (&pThis->m_cs_Done_Redir_Events);
00210 }
00211 break;
00212
00213 case DME_DOWNLOADWILLBEDELETED:
00214 {
00215 fsDLWebPage* wp = pThis->FindWebPage (dld);
00216
00217 try {
00218 if (wp)
00219 {
00220 wp->strFile = wp->dld->pMgr->get_OutputFilePathName ();
00221 if (dld->pMgr->IsDone () == FALSE)
00222 wp->bState |= WPSTATE_DLDWASDELETED;
00223
00224 pThis->Event (WPDE_DLDWILLBEDELETED, (int)(fsDownload*) dld);
00225 wp->dld = NULL;
00226 }
00227 } catch (...) {}
00228 dld = NULL;
00229 }
00230 break;
00231
00232 case DME_DLDRESTORED:
00233 {
00234 fsDLWebPage* wp = pThis->FindWebPage (dld);
00235 if (wp == NULL)
00236 return FALSE;
00237
00238 pThis->WebPage_FindDownload (wp);
00239 wp->bState &= ~WPSTATE_DLDWASDELETED;
00240
00241 wp->dld->pfnDownloadEventsFunc = _DldEvents;
00242 wp->dld->lpEventsParam = pThis;
00243
00244 pThis->Event (WPDE_DLDRESTORED, (int) dld);
00245
00246 return TRUE;
00247 }
00248
00249 case DME_DLDWILLBEFULLYDELETED:
00250 {
00251 fsDLWebPage* wp = pThis->FindWebPage (dld);
00252
00253 try {
00254 if (wp)
00255 {
00256 try {
00257 if (dld->pMgr->IsDone () == FALSE)
00258 pThis->CorrectUnpUrls (wp, NULL);
00259 }catch (...) {}
00260
00261 wp->strFile = wp->dld->pMgr->get_OutputFilePathName ();
00262 wp->uDldId = UINT (-1);
00263 wp->dld = NULL;
00264
00265 SAFE_DELETE (wp->pvUnpLinks);
00266 }
00267 } catch (...) {}
00268 dld = NULL;
00269 }
00270 break;
00271
00272 case DME_DOWNLOADWASDELETEDFROMLIST:
00273 dld = NULL;
00274 break;
00275
00276 case DME_REDIRECTED:
00277
00278 EnterCriticalSection (&pThis->m_cs_Done_Redir_Events);
00279 try {
00280 pThis->OnDldRedirected (dld);
00281 }catch (...) {}
00282 LeaveCriticalSection (&pThis->m_cs_Done_Redir_Events);
00283 dld = NULL;
00284 break;
00285
00286 case DME_BEFOREDOWNLOADING:
00287 return pThis->OnCheckFileExtIsOK (dld);
00288 }
00289
00290 }
00291 catch (...) {}
00292
00293 if (dld)
00294 pThis->Event (WPDE_DLDEVENTRECEIVED, (int) dld);
00295
00296 return TRUE;
00297 }
00298
00299 fsDLWebPage* fsWebPageDownloader::AddWebPage(fsDLWebPage *wp, fs::ListTree <fsDLWebPage> *root, fsSchedule *task)
00300 {
00301 if (m_bIsDeleting)
00302 return NULL;
00303
00304 fs::ListTree <fsDLWebPage>* wptree = &m_pages;
00305
00306 fsDLWebPage *wpadded = NULL;
00307
00308 wp->dld->pfnDownloadEventsFunc = _DldEvents;
00309 wp->dld->lpEventsParam = this;
00310
00311 _pwndDownloads->CreateDownload (wp->dld, task, TRUE);
00312
00313 wp->uDldId = wp->dld->nID;
00314
00315 WebPage_FindDownload (wp);
00316
00317 if (root)
00318 {
00319 if (m_bIsDeleting == FALSE)
00320 {
00321 wptree = root->AddLeaf (*wp);
00322 wpadded = &root->GetLeaf (root->GetLeafCount () - 1)->GetData ();
00323 }
00324 }
00325 else
00326 {
00327 m_pages.SetData (*wp);
00328 wpadded = &m_pages.GetData ();
00329 }
00330
00331 if (wpadded)
00332 {
00333 _Conformity conf;
00334 conf.wptree = wptree;
00335 conf.wp = wpadded;
00336 m_vConfs.push_back (conf);
00337 }
00338
00339 Event (WPDE_FILEADDED, (int) root);
00340
00341 return wpadded;
00342 }
00343
00344 void fsWebPageDownloader::WebPage_FindDownload(fsDLWebPage *wp)
00345 {
00346 wp->dld = _DldsMgr.GetDownloadByID (wp->uDldId);
00347 }
00348
00349 fsDLWebPage* fsWebPageDownloader::FindWebPage(vmsDownloadSmartPtr dld)
00350 {
00351 t_wptree wptree = FindWebPageTree (dld);
00352 return wptree ? &wptree->GetData () : NULL;
00353 }
00354
00355 fs::ListTree <fsDLWebPage>* fsWebPageDownloader::FindWebPageTree(vmsDownloadSmartPtr dld)
00356 {
00357 for (size_t i = 0; i < m_vConfs.size (); i++)
00358 {
00359 if (m_vConfs [i].wp->uDldId == dld->nID)
00360 return m_vConfs [i].wptree;
00361 }
00362
00363 return NULL;
00364 }
00365
00366 void fsWebPageDownloader::OnWPDownloadDone(vmsDownloadSmartPtr dld)
00367 {
00368 if (dld == NULL)
00369 return;
00370
00371 ASSERT (dld->pMgr->GetDownloadMgr () != NULL);
00372 if (dld->pMgr->GetDownloadMgr () == NULL)
00373 return;
00374
00375 t_wptree wptree = FindWebPageTree (dld);
00376
00377 if (wptree == NULL)
00378 return;
00379
00380 SAFE_DELETE (wptree->GetData ().pvUnpLinks);
00381
00382 char szFile [MY_MAX_PATH];
00383 if (fsFileNameFromUrlPath (dld->pMgr->GetDownloadMgr ()->GetDNP ()->pszPathName,
00384 dld->pMgr->GetDownloadMgr ()->GetDNP ()->enProtocol == NP_FTP,
00385 TRUE, szFile, sizeof (szFile)))
00386 {
00387 LPCSTR pszExt = strrchr (szFile, '.');
00388 if (pszExt == NULL || IsExtInExtsStr (m_wpds.strHTMLExts, pszExt+1) || *szFile == 0)
00389 {
00390 dld->pMgr->GetDownloadMgr ()->CloseFile ();
00391
00392 if (m_wpds.dwFlags & WPDF_DONTSTOREPAGES)
00393 dld->dwFlags |= DLD_DELETEWHENDONE | DLD_DELETEFILEALWAYS |
00394 DLD_DONTPUTTOHISTORY | DLD_DONTPUTTORECYCLE;
00395
00396 ParseHTMLFile (wptree, wptree->GetDepth () > m_wpds.iDepth);
00397 }
00398 }
00399 }
00400
00401 int fsWebPageDownloader::FindConfIndex(vmsDownloadSmartPtr dld)
00402 {
00403 for (size_t i = 0; i < m_vConfs.size (); i++)
00404 {
00405 if (m_vConfs [i].wp->dld == dld)
00406 return i;
00407 }
00408
00409 return -1;
00410 }
00411
00412 void fsWebPageDownloader::ParseHTMLFile(t_wptree wptree, BOOL bFixUrlsOnly)
00413 {
00414 fsDLWebPage *wp = &wptree->GetData ();
00415
00416 if (wp->bState & WPSTATE_PAGEPROCESSED)
00417 return;
00418
00419 HANDLE hFile;
00420
00421 ASSERT (wp->dld == NULL || wp->dld->pMgr->GetDownloadMgr () != NULL);
00422 if (wp->dld != NULL && wp->dld->pMgr->GetDownloadMgr () == NULL)
00423 return;
00424
00425 do
00426 {
00427 hFile = CreateFile (wp->dld ? wp->dld->pMgr->get_OutputFilePathName () : wp->strFile,
00428 GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ,
00429 NULL, OPEN_EXISTING, 0, NULL);
00430
00431 if (hFile == INVALID_HANDLE_VALUE)
00432 {
00433
00434 return;
00435 }
00436 }
00437 while (hFile == INVALID_HANDLE_VALUE);
00438
00439 DWORD dwSize = GetFileSize (hFile, NULL);
00440 LPSTR pszHTML;
00441 fsnew (pszHTML, char, int (dwSize*1.5) + 50000 + 1);
00442
00443 UINT newlen = 0;
00444 if (ReadFile (hFile, pszHTML, dwSize, &dwSize, NULL))
00445 {
00446 pszHTML [dwSize] = 0;
00447
00448 newlen = ParseHTML (pszHTML, wptree, bFixUrlsOnly);
00449
00450 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
00451 {
00452
00453 SetFilePointer (hFile, 0, NULL, FILE_BEGIN);
00454 SetEndOfFile (hFile);
00455 DWORD dw;
00456 WriteFile (hFile, pszHTML, newlen, &dw, NULL);
00457 }
00458 }
00459
00460 delete [] pszHTML;
00461 CloseHandle (hFile);
00462
00463 wp->bState |= WPSTATE_PAGEPROCESSED;
00464 }
00465
00466 UINT fsWebPageDownloader::ParseHTML(LPCSTR pszHTML, t_wptree wptree, BOOL bFixUrlsOnly)
00467 {
00468 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) && bFixUrlsOnly)
00469 return 0;
00470
00471 fsHTMLParser html;
00472
00473 html.SetKillDupes (FALSE);
00474
00475 html.ParseHTML ((LPSTR)pszHTML);
00476
00477 char szBaseURL [10000];
00478 LPCSTR pszBaseURL = html.Get_BaseURL ();
00479 if (*pszBaseURL == 0)
00480 pszBaseURL = wptree->GetData ().strURL;
00481 else
00482 {
00483 fsURL url;
00484 if (url.Crack (pszBaseURL) != IR_SUCCESS)
00485 {
00486 lstrcpy (szBaseURL, "http://");
00487 lstrcat (szBaseURL, pszBaseURL);
00488 if (url.Crack (szBaseURL) == IR_SUCCESS)
00489 {
00490 if (szBaseURL [strlen (szBaseURL)-1] != '/' && szBaseURL [strlen (szBaseURL)-1] != '\\')
00491 lstrcat (szBaseURL, "/");
00492
00493 pszBaseURL = szBaseURL;
00494 }
00495 else
00496 pszBaseURL = wptree->GetData ().strURL;
00497 }
00498 else
00499 {
00500 if (pszBaseURL [strlen (pszBaseURL)-1] != '/' && pszBaseURL [strlen (pszBaseURL)-1] != '\\')
00501 {
00502 lstrcpy (szBaseURL, pszBaseURL);
00503 lstrcat (szBaseURL, "/");
00504 pszBaseURL = szBaseURL;
00505 }
00506 }
00507 }
00508
00509 BOOL bAdded = FALSE;
00510 int cAdded;
00511
00512 cAdded = ParseHTMLFrameUrls (html, wptree, bFixUrlsOnly, pszBaseURL);
00513 bAdded = bAdded || cAdded != 0;
00514
00515 cAdded = ParseHTMLUrls (html, wptree, bFixUrlsOnly, pszBaseURL);
00516 bAdded = bAdded || cAdded != 0;
00517
00518 if (m_wpds.bDownloadStyles)
00519 {
00520 cAdded = ParseHTMLLinkUrls (html, wptree, bFixUrlsOnly, pszBaseURL);
00521 bAdded = bAdded || cAdded != 0;
00522 }
00523
00524 if (m_wpds.bDownloadImages)
00525 {
00526 cAdded = ParseHTMLImages (html, wptree, bFixUrlsOnly, pszBaseURL);
00527 bAdded = bAdded || cAdded != 0;
00528 }
00529
00530 if (*html.Get_BaseURL ())
00531 html.RemoveBaseTag ();
00532
00533 if (bAdded)
00534 _Snds.Event (SME_DOWNLOADADDED);
00535
00536 return html.GetHTMLLength ();
00537 }
00538
00539 int fsWebPageDownloader::ParseHTMLUrls(fsHTMLParser &parser, t_wptree wptree, BOOL bFixUrlsOnly, LPCSTR pszBaseURL)
00540 {
00541 int cAdded = 0;
00542
00543 for (int i = 0; i < parser.GetUrlCount (); i ++)
00544 {
00545 LPCSTR pszUrl = parser.GetUrl (i);
00546
00547 if (*pszUrl == '#')
00548 continue;
00549
00550 LPSTR pszFullUrl;
00551
00552 fsUrlToFullUrl (pszBaseURL, pszUrl, &pszFullUrl);
00553 if (pszFullUrl == NULL)
00554 continue;
00555
00556 fsURL url;
00557 if (IR_SUCCESS != url.Crack (pszFullUrl, TRUE))
00558 {
00559 delete [] pszFullUrl;
00560 continue;
00561 }
00562
00563 if (*url.GetHostName () == 0)
00564 {
00565 delete [] pszFullUrl;
00566 continue;
00567 }
00568
00569 LPCSTR pszPath = url.GetPath ();
00570
00571 char szFile [MY_MAX_PATH];
00572 fsFileNameFromUrlPath (pszPath, url.GetInternetScheme () == INTERNET_SCHEME_FTP,
00573 TRUE, szFile, sizeof (szFile));
00574
00575 LPCSTR pszExt = strrchr (szFile, '.');
00576
00577 BOOL bExt = FALSE;
00578 BOOL bWebPage = FALSE;
00579
00580 if (pszExt++)
00581 {
00582 bExt = IsExtInExtsStr (m_wpds.strExts, pszExt);
00583 bWebPage = IsExtInExtsStr (m_wpds.strHTMLExts, pszExt);
00584 }
00585 else
00586 {
00587 bWebPage = TRUE;
00588 }
00589
00590 if (*pszPath == 0 || pszPath [strlen (pszPath)-1] == '/' || pszPath [strlen (pszPath)-1] == '\\')
00591 bWebPage = TRUE;
00592
00593 if (url.GetInternetScheme () != INTERNET_SCHEME_HTTP && url.GetInternetScheme () != INTERNET_SCHEME_HTTPS)
00594 bWebPage = FALSE;
00595
00596 BOOL bSetCTReq = bWebPage ? TRUE : FALSE;
00597
00598 if (m_wpds.bDownloadFiles == FALSE)
00599 {
00600 if (bWebPage == FALSE)
00601 {
00602
00603 parser.ReplaceUrl (i, pszFullUrl);
00604 delete [] pszFullUrl;
00605 continue;
00606 }
00607 cAdded++;
00608 }
00609 else
00610 bSetCTReq = FALSE;
00611
00612 if (m_wpds.enExtsType != WPDET_OFF)
00613 {
00614 if ( (bExt && m_wpds.enExtsType == WPDET_NOTDOWNLOAD) ||
00615 (bExt == FALSE && m_wpds.enExtsType == WPDET_DOWNLOAD) )
00616 {
00617 if (bWebPage == FALSE)
00618 {
00619
00620 parser.ReplaceUrl (i, pszFullUrl);
00621 delete [] pszFullUrl;
00622 continue;
00623 }
00624 }
00625 }
00626
00627 if ( (m_wpds.bNotAllFiles && bWebPage == FALSE) ||
00628 (m_wpds.bNotAllPages && bWebPage) )
00629 {
00630
00631 if (fsIsServersEqual (url.GetHostName (), m_strStartServer) == FALSE)
00632 {
00633
00634 parser.ReplaceUrl (i, pszFullUrl);
00635 delete [] pszFullUrl;
00636 continue;
00637 }
00638 }
00639
00640 if (IsURLShouldBeIgnored (url))
00641 {
00642 parser.ReplaceUrl (i, pszFullUrl);
00643 SAFE_DELETE_ARRAY (pszFullUrl);
00644 continue;
00645 }
00646
00647 LPSTR pszWA = NULL, pszFA = NULL;
00648
00649 if (bWebPage && bFixUrlsOnly == FALSE)
00650 {
00651 if (CrackUrl (pszFullUrl, &pszWA, &pszFA))
00652 {
00653 delete [] pszFullUrl;
00654 pszFullUrl = pszWA;
00655 }
00656 }
00657
00658 fsDLWebPage *wp = FindWebPage (pszFullUrl);
00659
00660 if (wp == NULL && bFixUrlsOnly == FALSE)
00661 {
00662
00663 wp = AddPage (wptree, pszFullUrl, bWebPage ? WPDPT_PAGE : WPDPT_FILE, bSetCTReq);
00664
00665 if (wp == NULL)
00666 {
00667 SAFE_DELETE_ARRAY (pszFA);
00668 parser.ReplaceUrl (i, pszFullUrl);
00669 delete [] pszFullUrl;
00670 continue;
00671 }
00672 }
00673
00674 if (wp && wp->pvUnpLinks)
00675 {
00676 _WP_UnprocessedLinks unplink;
00677 unplink.nWPIDWhere = wptree->GetData ().nID;
00678 unplink.nParserUrl = i;
00679 unplink.lt = WPLT_A;
00680 wp->pvUnpLinks->add (unplink);
00681 }
00682
00683 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
00684 {
00685 if (wp)
00686 {
00687
00688 GetFileForReplace (&wptree->GetData (), wp, szFile);
00689 if (pszFA)
00690 strcat (szFile, strchr (pszFA, '#'));
00691
00692
00693 parser.ReplaceUrl (i, szFile);
00694 }
00695 else
00696 {
00697 parser.ReplaceUrl (i, pszFullUrl);
00698 }
00699 }
00700
00701 delete [] pszFullUrl;
00702 }
00703
00704 return cAdded;
00705 }
00706
00707 fsDLWebPage* fsWebPageDownloader::FindWebPage(LPCSTR pszFullUrl)
00708 {
00709 fsURL url1, url2;
00710
00711 if (IR_SUCCESS != url1.Crack (pszFullUrl))
00712 return NULL;
00713
00714
00715 for (size_t i = 0; i < m_vConfs.size (); i ++)
00716 {
00717 fsDLWebPage *wp = m_vConfs [i].wp;
00718
00719
00720
00721 if (IsUrlsEqual (url1, wp->strURL))
00722 return wp;
00723
00724
00725 for (int i = 0; i < wp->pvUrls->size (); i++)
00726 if (IsUrlsEqual (url1, wp->pvUrls->at (i)))
00727 return wp;
00728 }
00729
00730 return NULL;
00731 }
00732
00733 int fsWebPageDownloader::ParseHTMLImages(fsHTMLParser &parser, t_wptree wptree, BOOL bFixUrlsOnly, LPCSTR pszBaseURL)
00734 {
00735 int cAdded = 0;
00736
00737 for (int i = 0; i < parser.GetImageCount (); i ++)
00738 {
00739 LPCSTR pszUrl = parser.GetImage (i);
00740
00741 LPSTR pszFullUrl;
00742 fsUrlToFullUrl (pszBaseURL, pszUrl, &pszFullUrl);
00743 if (pszFullUrl == NULL)
00744 continue;
00745
00746 fsURL url;
00747 if (IR_SUCCESS != url.Crack (pszFullUrl, TRUE))
00748 {
00749 delete [] pszFullUrl;
00750 continue;
00751 }
00752
00753 if (*url.GetHostName () == 0)
00754 {
00755 delete [] pszFullUrl;
00756 continue;
00757 }
00758
00759 char szFile [MY_MAX_PATH];
00760 fsFileNameFromUrlPath (url.GetPath (), url.GetInternetScheme () == INTERNET_SCHEME_FTP,
00761 TRUE, szFile, sizeof (szFile));
00762 LPCSTR pszExt = strrchr (szFile, '.');
00763 BOOL bExt = FALSE;
00764 if (pszExt++)
00765 bExt = IsExtInExtsStr (m_wpds.strImgsExts, pszExt);
00766
00767 if (m_wpds.enImgsExtsType != WPDET_OFF)
00768 {
00769 if ( (bExt && m_wpds.enImgsExtsType == WPDET_NOTDOWNLOAD) ||
00770 (bExt == FALSE && m_wpds.enImgsExtsType == WPDET_DOWNLOAD) )
00771 {
00772 parser.ReplaceImage (i, pszFullUrl);
00773 delete [] pszFullUrl;
00774 continue;
00775 }
00776 }
00777
00778 if (m_wpds.bNotAllImages)
00779 {
00780 int iUrl = parser.GetImageLinkTo (i);
00781 if (iUrl != -1)
00782 {
00783 fsURL url;
00784 LPCSTR pszUrlTo = parser.GetUrl (iUrl);
00785 LPSTR pszFullUrlTo;
00786 fsUrlToFullUrl (wptree->GetData ().strURL, pszUrlTo, &pszFullUrlTo);
00787
00788 if (pszFullUrlTo)
00789 {
00790 url.Crack (pszFullUrlTo);
00791 delete [] pszFullUrlTo;
00792
00793 if (fsIsServersEqual (url.GetHostName (), m_strStartServer) == FALSE)
00794 {
00795 parser.ReplaceImage (i, pszFullUrl);
00796 delete [] pszFullUrl;
00797 continue;
00798 }
00799 }
00800 }
00801 }
00802
00803 fsDLWebPage *wp = FindWebPage (pszFullUrl);
00804
00805 if (wp == NULL && bFixUrlsOnly == FALSE)
00806 {
00807 wp = AddPage (wptree, pszFullUrl, WPDPT_IMAGE);
00808 if (wp == NULL)
00809 {
00810 parser.ReplaceImage (i, pszFullUrl);
00811 delete [] pszFullUrl;
00812 continue;
00813 }
00814 cAdded++;
00815 }
00816
00817 if (wp && wp->pvUnpLinks)
00818 {
00819 _WP_UnprocessedLinks unplink;
00820 unplink.nWPIDWhere = wptree->GetData ().nID;
00821 unplink.nParserUrl = i;
00822 unplink.lt = WPLT_IMG;
00823 wp->pvUnpLinks->add (unplink);
00824 }
00825
00826 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
00827 {
00828 if (wp)
00829 {
00830 GetFileForReplace (&wptree->GetData (), wp, szFile);
00831 parser.ReplaceImage (i, szFile);
00832 }
00833 else
00834 {
00835 parser.ReplaceImage (i, pszFullUrl);
00836 }
00837 }
00838
00839 delete [] pszFullUrl;
00840 }
00841
00842 return cAdded;
00843 }
00844
00845 LPCSTR fsWebPageDownloader::GetStartURL()
00846 {
00847 return m_pages.GetData ().strURL;
00848 }
00849
00850 float fsWebPageDownloader::GetPercentDone()
00851 {
00852 float fDone = 0;
00853
00854 int cConfs = m_vConfs.size ();
00855 if (cConfs == 0)
00856 return 0;
00857
00858 for (int i = cConfs - 1; i >= 0; i--)
00859 {
00860 fsDLWebPage *wp = m_vConfs [i].wp;
00861
00862 if (wp->dld && (wp->bState & WPSTATE_DLDWASDELETED) == 0 && wp->dld->pMgr->IsDone () == FALSE)
00863 {
00864 fDone += wp->dld->pMgr->GetPercentDone ();
00865 ASSERT (fDone >= 0);
00866 }
00867 else
00868 fDone += 100;
00869 }
00870
00871 ASSERT (fDone >= 0);
00872
00873 return fDone / m_vConfs.size ();
00874 }
00875
00876 int fsWebPageDownloader::GetFileCount()
00877 {
00878 return m_vConfs.size ();
00879 }
00880
00881 int fsWebPageDownloader::GetDoneFileCount()
00882 {
00883 int cDone = 0;
00884
00885 for (int i = m_vConfs.size () - 1; i >= 0; i--)
00886 {
00887 fsDLWebPage *wp = m_vConfs [i].wp;
00888
00889 if (wp->dld && (wp->bState & WPSTATE_DLDWASDELETED) == 0)
00890 {
00891 if (wp->dld->pMgr->IsDone ())
00892 cDone ++;
00893 }
00894 else
00895 cDone ++;
00896 }
00897
00898 return cDone;
00899 }
00900
00901 BOOL fsWebPageDownloader::IsRunning()
00902 {
00903 try {
00904
00905 for (int i = m_vConfs.size () - 1; i >= 0; i--)
00906 {
00907 if (m_vConfs [i].wp->dld && (m_vConfs [i].wp->bState & WPSTATE_DLDWASDELETED) == 0 &&
00908 m_vConfs [i].wp->dld->pMgr->IsRunning ())
00909 return TRUE;
00910 }
00911
00912 }catch (...) {Sleep (0); return IsRunning ();}
00913
00914 return FALSE;
00915 }
00916
00917 BOOL fsWebPageDownloader::IsDone()
00918 {
00919 return GetDoneFileCount () == GetFileCount ();
00920 }
00921
00922 BOOL fsWebPageDownloader::IsScheduled()
00923 {
00924 if (_pwndScheduler == NULL)
00925 return FALSE;
00926
00927 for (int i = m_vConfs.size () - 1; i >= 0; i--)
00928 {
00929 if (m_vConfs [i].wp->dld && (m_vConfs [i].wp->bState & WPSTATE_DLDWASDELETED) == 0 &&
00930 _pwndScheduler->GetMgr ()->IsDownloadScheduled (m_vConfs [i].wp->dld))
00931 return TRUE;
00932 }
00933
00934 return FALSE;
00935 }
00936
00937 BOOL fsWebPageDownloader::IsOnAutoStart()
00938 {
00939 for (int i = m_vConfs.size () - 1; i >= 0; i--)
00940 {
00941 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
00942
00943 if (dld && (m_vConfs [i].wp->bState & WPSTATE_DLDWASDELETED) == 0 &&
00944 dld->bAutoStart && dld->pMgr->IsDone () == FALSE)
00945 return TRUE;
00946 }
00947
00948 return FALSE;
00949 }
00950
00951 BOOL fsWebPageDownloader::IsDownloading()
00952 {
00953 for (int i = m_vConfs.size () - 1; i >= 0; i--)
00954 {
00955 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
00956 if (dld != NULL)
00957 {
00958 if ((m_vConfs [i].wp->bState & WPSTATE_DLDWASDELETED) == 0 &&
00959 dld->pMgr->IsDownloading ())
00960 {
00961 return TRUE;
00962 }
00963 }
00964 }
00965
00966 return FALSE;
00967 }
00968
00969 BOOL fsWebPageDownloader::CrackUrl(LPCSTR pszFullUrl, LPSTR *ppszUrlWA, LPSTR *ppszFA)
00970 {
00971 LPCSTR pszA;
00972
00973 if (fsIsAnchorInUrl (pszFullUrl, ppszUrlWA, &pszA) == FALSE)
00974 return FALSE;
00975
00976 if (ppszUrlWA && ppszFA)
00977 {
00978 char szFile [10000];
00979 *szFile = 0;
00980 fsFileNameFromUrlPath (*ppszUrlWA, FALSE, FALSE, szFile, sizeof (szFile));
00981
00982 fsnew (*ppszFA, char, strlen (szFile) + strlen (pszA) + 1);
00983
00984 strcpy (*ppszFA, szFile);
00985 strcat (*ppszFA, pszA);
00986 }
00987
00988 return TRUE;
00989 }
00990
00991 fsWPDSettings* fsWebPageDownloader::GetWDPS()
00992 {
00993 return &m_wpds;
00994 }
00995
00996 void fsWebPageDownloader::ReadDefaultWPDS(fsWPDSettings *wpds)
00997 {
00998 wpds->strHTMLExts = _App.Spider_HTMLExts ();
00999 wpds->bNotAllFiles = _App.Spider_NotAllFiles ();
01000 wpds->bNotAllImages = _App.Spider_NotAllImages ();
01001 wpds->bNotAllPages = _App.Spider_NotAllPages ();
01002 wpds->bDownloadFiles = _App.Spider_DownloadFiles ();
01003 wpds->bDownloadImages = _App.Spider_DownloadImages ();
01004 wpds->iDepth = _App.Spider_Depth ();
01005 wpds->strExts = _App.Spider_Exts ();
01006 wpds->enExtsType = _App.Spider_ExtsType ();
01007 wpds->strImgsExts = _App.Spider_ImgExts ();
01008 wpds->enImgsExtsType = _App.Spider_ImgExtsType ();
01009 wpds->bDownloadStyles = _App.Spider_DownloadStyles ();
01010 wpds->bSavePagesUnderHTM = _App.Spider_SavePagesUnderHTM ();
01011 wpds->dwFlags = WPDF_KEEPFOLDERSTRUCTURE | WPDF_DELCOMPLETEDDLDS;
01012 }
01013
01014 int fsWebPageDownloader::ParseHTMLLinkUrls(fsHTMLParser &parser, t_wptree wptree, BOOL bFixUrlsOnly, LPCSTR pszBaseURL)
01015 {
01016
01017 int cAdded = 0;
01018
01019 for (int i = 0; i < parser.GetLinkUrlCount (); i ++)
01020 {
01021 LPCSTR pszUrl = parser.GetLinkUrl (i);
01022 fsLinkRelType lrt = parser.GetLinkUrlRelType (i);
01023
01024 if (lrt != LRT_STYLESHEET)
01025 continue;
01026
01027 LPSTR pszFullUrl;
01028 fsUrlToFullUrl (pszBaseURL, pszUrl, &pszFullUrl);
01029 if (pszFullUrl == NULL)
01030 continue;
01031
01032 fsURL url;
01033 url.Crack (pszFullUrl);
01034
01035 if (*url.GetHostName () == 0)
01036 {
01037 delete [] pszFullUrl;
01038 continue;
01039 }
01040
01041 char szFile [MY_MAX_PATH];
01042 fsFileNameFromUrlPath (url.GetPath (), url.GetInternetScheme () == INTERNET_SCHEME_FTP,
01043 TRUE, szFile, sizeof (szFile));
01044
01045 fsDLWebPage *wp = FindWebPage (pszFullUrl);
01046
01047 if (wp == NULL && bFixUrlsOnly == NULL)
01048 {
01049 wp = AddPage (wptree, pszFullUrl, WPDPT_CSS);
01050 if (wp == NULL)
01051 {
01052 parser.ReplaceLinkUrl (i, pszFullUrl);
01053 delete [] pszFullUrl;
01054 continue;
01055 }
01056 cAdded++;
01057 }
01058
01059 if (wp && wp->pvUnpLinks)
01060 {
01061 _WP_UnprocessedLinks unplink;
01062 unplink.nWPIDWhere = wptree->GetData ().nID;
01063 unplink.nParserUrl = i;
01064 unplink.lt = WPLT_STYLESHEET;
01065 wp->pvUnpLinks->add (unplink);
01066 }
01067
01068 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
01069 {
01070 if (wp)
01071 {
01072
01073 GetFileForReplace (&wptree->GetData (), wp, szFile);
01074 parser.ReplaceLinkUrl (i, szFile);
01075 }
01076 else
01077 {
01078 parser.ReplaceLinkUrl (i, pszFullUrl);
01079 }
01080 }
01081
01082 delete [] pszFullUrl;
01083 }
01084
01085 return cAdded;
01086 }
01087
01088 int fsWebPageDownloader::GetDownloadCount()
01089 {
01090 return m_vConfs.size ();
01091 }
01092
01093 vmsDownloadSmartPtr fsWebPageDownloader::GetDownload(int iIndex)
01094 {
01095 return m_vConfs [iIndex].wp->dld;
01096 }
01097
01098 void fsWebPageDownloader::StartDownloading()
01099 {
01100 m_bStopped = FALSE;
01101 m_bWasShutdownMsg = FALSE;
01102
01103 DLDS_LIST vDlds;
01104 for (size_t i = 0; i < m_vConfs.size (); i++)
01105 {
01106 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01107 if (dld == NULL || dld->pMgr->IsDone ())
01108 continue;
01109 vDlds.push_back (dld);
01110 }
01111
01112 _DldsMgr.StartDownloads (vDlds, TRUE);
01113 }
01114
01115 void fsWebPageDownloader::StopDownloading()
01116 {
01117 m_bStopped = TRUE;
01118
01119 DLDS_LIST vDlds;
01120
01121 size_t sizeWas = m_vConfs.size ();
01122
01123 for (size_t i = 0; i < sizeWas; i++)
01124 {
01125 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01126 if (dld == NULL)
01127 continue;
01128 vDlds.push_back (dld);
01129 }
01130
01131 if (vDlds.size () == 0)
01132 return;
01133
01134 _DldsMgr.StopDownloads (vDlds, TRUE);
01135
01136 for (i = 0; i < vDlds.size (); i++)
01137 _pwndDownloads->UpdateDownload (vDlds [i]);
01138
01139 if ((size_t)m_vConfs.size () > sizeWas)
01140 StopDownloading ();
01141 }
01142
01143 void fsWebPageDownloader::SetAutoStartDownloading(BOOL b)
01144 {
01145 if (b)
01146 m_bStopped = FALSE;
01147
01148 for (size_t i = 0; i < m_vConfs.size (); i++)
01149 {
01150 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01151 if (dld && dld->pMgr->IsDone () == FALSE)
01152 {
01153 dld->bAutoStart = b;
01154 _pwndDownloads->UpdateDownload (dld);
01155 }
01156 }
01157
01158 _DldsMgr.ProcessDownloads ();
01159 }
01160
01161 void fsWebPageDownloader::SetEventFunc(fntWPDEvents pfn, LPVOID lp)
01162 {
01163 m_pfnEvents = pfn;
01164 m_lpEventsParam = lp;
01165 }
01166
01167 void fsWebPageDownloader::Event(fsWPDEvent ev, int info)
01168 {
01169 try {
01170 if (m_pfnEvents)
01171 m_pfnEvents (this, ev, info, m_lpEventsParam);
01172 }
01173 catch (...) {}
01174 }
01175
01176 void fsWebPageDownloader::DeleteAllDownloads(BOOL bByUser)
01177 {
01178 DLDS_LIST vDlds;
01179
01180 m_bIsDeleting = TRUE;
01181 m_bStopped = TRUE;
01182
01183 StopDownloading ();
01184
01185 for (size_t i = 0; i < m_vConfs.size (); i++)
01186 {
01187 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01188 if (dld)
01189 vDlds.push_back (dld);
01190 }
01191
01192 if (vDlds.size () == 0)
01193 return;
01194
01195 size_t cDeleted = _DldsMgr.DeleteDownloads (vDlds, bByUser, FALSE);
01196
01197 for (i = 0; i < m_vConfs.size () && cDeleted; i++)
01198 {
01199 if (m_vConfs [i].wp->dld)
01200 {
01201 m_vConfs [i].wp->dld = NULL;
01202 cDeleted --;
01203 }
01204 }
01205
01206 if (vDlds.size () == cDeleted)
01207 DeleteAllDownloads (bByUser);
01208 }
01209
01210 void fsWebPageDownloader::DetachFromDownloads()
01211 {
01212 for (size_t i = 0; i < m_vConfs.size (); i++)
01213 {
01214 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01215 if (dld)
01216 dld->pfnDownloadEventsFunc = NULL;
01217 }
01218 }
01219
01220 BOOL fsWebPageDownloader::Save(HANDLE hFile)
01221 {
01222 DWORD dw;
01223
01224 if (FALSE == fsSaveStrToFile (m_strStartServer, hFile))
01225 return FALSE;
01226
01227 if (FALSE == WriteFile (hFile, &m_wpds.bDownloadFiles, sizeof (m_wpds.bDownloadFiles), &dw, NULL))
01228 return FALSE;
01229
01230 if (FALSE == WriteFile (hFile, &m_wpds.bDownloadImages, sizeof (m_wpds.bDownloadImages), &dw, NULL))
01231 return FALSE;
01232
01233 if (FALSE == WriteFile (hFile, &m_wpds.bDownloadStyles, sizeof (m_wpds.bDownloadStyles), &dw, NULL))
01234 return FALSE;
01235
01236 if (FALSE == WriteFile (hFile, &m_wpds.bNotAllFiles, sizeof (m_wpds.bNotAllFiles), &dw, NULL))
01237 return FALSE;
01238
01239 if (FALSE == WriteFile (hFile, &m_wpds.bNotAllImages, sizeof (m_wpds.bNotAllImages), &dw, NULL))
01240 return FALSE;
01241
01242 if (FALSE == WriteFile (hFile, &m_wpds.bNotAllPages, sizeof (m_wpds.bNotAllPages), &dw, NULL))
01243 return FALSE;
01244
01245 if (FALSE == WriteFile (hFile, &m_wpds.bSavePagesUnderHTM, sizeof (m_wpds.bSavePagesUnderHTM), &dw, NULL))
01246 return FALSE;
01247
01248 if (FALSE == WriteFile (hFile, &m_wpds.dwFlags, sizeof (m_wpds.dwFlags), &dw, NULL))
01249 return FALSE;
01250
01251 if (FALSE == WriteFile (hFile, &m_wpds.enExtsType, sizeof (m_wpds.enExtsType), &dw, NULL))
01252 return FALSE;
01253
01254 if (FALSE == WriteFile (hFile, &m_wpds.enImgsExtsType, sizeof (m_wpds.enImgsExtsType), &dw, NULL))
01255 return FALSE;
01256
01257 if (FALSE == WriteFile (hFile, &m_wpds.iDepth, sizeof (m_wpds.iDepth), &dw, NULL))
01258 return FALSE;
01259
01260 if (FALSE == WriteFile (hFile, &m_wpds.iReserved, sizeof (m_wpds.iReserved), &dw, NULL))
01261 return FALSE;
01262
01263 if (FALSE == WriteFile (hFile, &m_wpds.pDLGroup->nId, sizeof (m_wpds.pDLGroup->nId), &dw, NULL))
01264 return FALSE;
01265
01266 if (FALSE == fsSaveStrToFile (m_wpds.strExts, hFile))
01267 return FALSE;
01268
01269 if (FALSE == fsSaveStrToFile (m_wpds.strImgsExts, hFile))
01270 return FALSE;
01271
01272 if (FALSE == fsSaveStrToFile (m_wpds.strFolderSaveTo, hFile))
01273 return FALSE;
01274
01275 if (FALSE == fsSaveStrToFile (m_wpds.strHTMLExts, hFile))
01276 return FALSE;
01277
01278 if (FALSE == fsSaveStrToFile (m_wpds.strUserName, hFile))
01279 return FALSE;
01280
01281 if (FALSE == fsSaveStrToFile (m_wpds.strPassword, hFile))
01282 return FALSE;
01283
01284
01285 int cItems = m_wpds.vIgnoreList.size ();
01286 if (FALSE == WriteFile (hFile, &cItems, sizeof (int), &dw, NULL))
01287 return FALSE;
01288 for (int i = 0; i < cItems; i++)
01289 {
01290 if (FALSE == fsSaveStrToFile (m_wpds.vIgnoreList [i]->strURL, hFile))
01291 return FALSE;
01292
01293 if (FALSE == WriteFile (hFile, &m_wpds.vIgnoreList [i]->dwFlags, sizeof (DWORD),
01294 &dw, NULL))
01295 return FALSE;
01296 }
01297
01298 return Save (hFile, &m_pages);
01299 }
01300
01301 BOOL fsWebPageDownloader::Save(HANDLE hFile, t_wptree root)
01302 {
01303 DWORD dw;
01304
01305 fsDLWebPage *wp = &root->GetData ();
01306
01307 if (FALSE == WriteFile (hFile, &wp->uDldId, sizeof (wp->uDldId), &dw, NULL))
01308 return FALSE;
01309
01310 if (FALSE == WriteFile (hFile, &wp->bState, sizeof (wp->bState), &dw, NULL))
01311 return FALSE;
01312
01313 if (FALSE == WriteFile (hFile, &wp->nID, sizeof (wp->nID), &dw, NULL))
01314 return FALSE;
01315
01316 if (FALSE == fsSaveStrToFile (wp->strFile, hFile))
01317 return FALSE;
01318
01319 if (FALSE == fsSaveStrToFile (wp->strURL, hFile))
01320 return FALSE;
01321
01322 BYTE cUrls = (BYTE) wp->pvUrls->size ();
01323 if (FALSE == WriteFile (hFile, &cUrls, sizeof (cUrls), &dw, NULL))
01324 return FALSE;
01325 for (int j = 0; j < cUrls; j++)
01326 {
01327 if (FALSE == fsSaveStrToFile (wp->pvUrls->at (j), hFile))
01328 return FALSE;
01329 }
01330
01331 BOOL b = wp->pvUnpLinks && wp->pvUnpLinks->size ();
01332
01333 if (FALSE == WriteFile (hFile, &b, sizeof (b), &dw, NULL))
01334 return FALSE;
01335
01336 if (b)
01337 {
01338 if (FALSE == wp->pvUnpLinks->save (hFile))
01339 return FALSE;
01340 }
01341
01342 int cLeafs = root->GetLeafCount ();
01343 if (FALSE == WriteFile (hFile, &cLeafs, sizeof (cLeafs), &dw, NULL))
01344 return FALSE;
01345
01346 for (int i = 0; i < cLeafs; i++)
01347 if (FALSE == Save (hFile, root->GetLeaf (i)))
01348 return FALSE;
01349
01350 return TRUE;
01351 }
01352
01353 BOOL fsWebPageDownloader::Load_OLD(HANDLE hFile, BOOL bOldVer)
01354 {
01355 DWORD dw;
01356
01357 if (FALSE == fsReadStrFromFile (&m_strStartServer.pszString, hFile))
01358 return FALSE;
01359
01360 if (bOldVer)
01361 {
01362 fsWPDSettings_v1 wpds1;
01363 if (FALSE == ReadFile (hFile, &wpds1, sizeof (wpds1), &dw, NULL))
01364 return FALSE;
01365 m_wpds.bDownloadFiles = wpds1.bDownloadFiles;
01366 m_wpds.bDownloadImages = wpds1.bDownloadImages;
01367 m_wpds.bDownloadStyles = wpds1.bDownloadStyles;
01368 m_wpds.bNotAllFiles = wpds1.bNotAllFiles;
01369 m_wpds.bNotAllImages = wpds1.bNotAllImages;
01370 m_wpds.bNotAllPages = wpds1.bNotAllPages;
01371 m_wpds.enExtsType = wpds1.enExtsType;
01372 m_wpds.enImgsExtsType = wpds1.enImgsExtsType;
01373 m_wpds.iDepth = wpds1.iDepth;
01374 m_wpds.iReserved = wpds1.iReserved;
01375 wpds1.strDLGroup.pszString = NULL;
01376 wpds1.strExts.pszString = NULL;
01377 wpds1.strFolderSaveTo.pszString = NULL;
01378 wpds1.strHTMLExts.pszString = NULL;
01379 wpds1.strImgsExts.pszString = NULL;
01380 }
01381 else
01382 {
01383 if (FALSE == ReadFile (hFile, &m_wpds, sizeof (m_wpds) - 2*sizeof (BOOL), &dw, NULL))
01384 return FALSE;
01385
01386 m_wpds.bSavePagesUnderHTM = FALSE;
01387 m_wpds.dwFlags = 0;
01388 }
01389
01390 fsString str;
01391 if (FALSE == fsReadStrFromFile (&str.pszString, hFile))
01392 return FALSE;
01393 m_wpds.pDLGroup = _DldsGrps.FindGroupByName (str);
01394 if (m_wpds.pDLGroup == NULL)
01395 m_wpds.pDLGroup = _DldsGrps.FindGroup (GRP_OTHER_ID);
01396
01397 if (FALSE == fsReadStrFromFile (&m_wpds.strExts.pszString, hFile))
01398 return FALSE;
01399
01400 if (FALSE == fsReadStrFromFile (&m_wpds.strImgsExts.pszString, hFile))
01401 return FALSE;
01402
01403 if (FALSE == fsReadStrFromFile (&m_wpds.strFolderSaveTo.pszString, hFile))
01404 return FALSE;
01405
01406 if (FALSE == fsReadStrFromFile (&m_wpds.strHTMLExts.pszString, hFile))
01407 return FALSE;
01408
01409 if (bOldVer == FALSE)
01410 {
01411 if (FALSE == fsReadStrFromFile (&m_wpds.strUserName.pszString, hFile))
01412 return FALSE;
01413
01414 if (FALSE == fsReadStrFromFile (&m_wpds.strPassword.pszString, hFile))
01415 return FALSE;
01416 }
01417 else
01418 {
01419 m_wpds.strUserName.pszString = NULL;
01420 m_wpds.strPassword.pszString = NULL;
01421 }
01422
01423 m_nMaxID = 0;
01424
01425 return Load (hFile, &m_pages, 3);
01426 }
01427
01428 BOOL fsWebPageDownloader::Load(HANDLE hFile, t_wptree root, WORD wVer)
01429 {
01430 DWORD dw;
01431
01432 fsDLWebPage wp;
01433
01434 if (FALSE == ReadFile (hFile, &wp.uDldId, sizeof (wp.uDldId), &dw, NULL))
01435 return FALSE;
01436
01437 if (FALSE == ReadFile (hFile, &wp.bState, sizeof (wp.bState), &dw, NULL))
01438 return FALSE;
01439
01440 if (FALSE == ReadFile (hFile, &wp.nID, sizeof (wp.nID), &dw, NULL))
01441 return FALSE;
01442
01443 if (FALSE == fsReadStrFromFile (&wp.strFile.pszString, hFile))
01444 return FALSE;
01445
01446 if (FALSE == fsReadStrFromFile (&wp.strURL.pszString, hFile))
01447 return FALSE;
01448
01449 BYTE cUrls;
01450 if (FALSE == ReadFile (hFile, &cUrls, sizeof (cUrls), &dw, NULL))
01451 return FALSE;
01452 fsnew1 (wp.pvUrls, fs::list <fsString>);
01453 while (cUrls--)
01454 {
01455 fsString strURL;
01456 if (FALSE == fsReadStrFromFile (&strURL.pszString, hFile))
01457 return FALSE;
01458 wp.pvUrls->add (strURL);
01459 }
01460
01461 m_nMaxID = max (m_nMaxID, wp.nID);
01462
01463 wp.dld = NULL;
01464 wp.pvUnpLinks = NULL;
01465
01466 if (wp.uDldId != UINT (-1))
01467 {
01468 WebPage_FindDownload (&wp);
01469
01470 BOOL b;
01471
01472 if (wVer > 3)
01473 {
01474 if (FALSE == ReadFile (hFile, &b, sizeof (b), &dw, NULL))
01475 return FALSE;
01476 }
01477 else
01478 {
01479 b = wp.dld ? wp.dld->pMgr->IsDone () == FALSE : FALSE;
01480 }
01481
01482 if (b)
01483 {
01484 fsnew1 (wp.pvUnpLinks, fs::list <_WP_UnprocessedLinks>);
01485 if (FALSE == wp.pvUnpLinks->load (hFile))
01486 return FALSE;
01487 }
01488
01489 if (wp.dld)
01490 {
01491 wp.dld->pfnDownloadEventsFunc = _DldEvents;
01492 wp.dld->lpEventsParam = this;
01493
01494 wp.dld->dwFlags |= DLD_USEDBYHTMLSPIDER;
01495 }
01496 }
01497
01498 root->SetData (wp);
01499
01500 _Conformity conf;
01501 conf.wp = &root->GetData ();
01502 conf.wptree = root;
01503 m_vConfs.push_back (conf);
01504
01505 int cLeafs;
01506 if (FALSE == ReadFile (hFile, &cLeafs, sizeof (cLeafs), &dw, NULL))
01507 return FALSE;
01508
01509 for (int i = 0; i < cLeafs; i++)
01510 {
01511 fsDLWebPage wp;
01512 if (FALSE == Load (hFile, root->AddLeaf (wp), wVer))
01513 return FALSE;
01514 }
01515
01516 return TRUE;
01517 }
01518
01519 t_wptree fsWebPageDownloader::GetRootPage()
01520 {
01521 return &m_pages;
01522 }
01523
01524 BOOL fsWebPageDownloader::IsUrlsEqual(fsURL &url1, LPCSTR pszUrl)
01525 {
01526 fsURL url2;
01527 if (url2.Crack (pszUrl) == IR_SUCCESS)
01528 {
01529 if (url1.GetPort () == url2.GetPort () && url1.GetInternetScheme () == url2.GetInternetScheme () &&
01530 fsIsServersEqual(url1.GetHostName (), url2.GetHostName ()))
01531 {
01532
01533 if (stricmp (url2.GetPath (), url1.GetPath ()) == 0)
01534 return TRUE;
01535
01536 if (*url2.GetPath () == 0 || strcmp (url2.GetPath (), "/") == 0 || strcmp (url2.GetPath (), "\\") == 0)
01537 {
01538 char szFile [MY_MAX_PATH];
01539 if (fsFileNameFromUrlPath (url1.GetPath (), url1.GetInternetScheme () == INTERNET_SCHEME_FTP,
01540 FALSE, szFile, sizeof (szFile)))
01541 {
01542 LPCSTR pszPath = url1.GetPath ();
01543 if (*pszPath == '\\' || *pszPath == '/')
01544 pszPath++;
01545
01546 if (strcmp (szFile, pszPath) == 0)
01547 {
01548 char *pszExt = strrchr (szFile, '.');
01549 if (pszExt)
01550 {
01551
01552 if (pszExt - szFile == 5 && strncmp (szFile, "index", 5) == 0)
01553 {
01554
01555 if (IsExtInExtsStr (m_wpds.strHTMLExts, pszExt+1))
01556 return TRUE;
01557 }
01558 }
01559 }
01560 }
01561 }
01562 }
01563 }
01564
01565 return FALSE;
01566 }
01567
01568 void fsWebPageDownloader::OnDldRedirected(vmsDownloadSmartPtr dld)
01569 {
01570 fsString strNewUrl = dld->pMgr->get_URL (TRUE);
01571
01572 fsDLWebPage *wp = FindWebPage (strNewUrl);
01573 if (wp == NULL)
01574 {
01575 wp = FindWebPage (dld);
01576 wp->pvUrls->add (wp->strURL);
01577 wp->strURL = strNewUrl;
01578 }
01579 else
01580 {
01581
01582 fsDLWebPage* oldwp = FindWebPage (dld);
01583
01584 if (oldwp == wp)
01585 return;
01586
01587 CorrectUnpUrls (oldwp, wp);
01588
01589 DeleteWebPage (oldwp);
01590
01591 dld->dwFlags |= DLD_DELETEFILEALWAYS;
01592 _pwndDownloads->DeleteDownload (dld, FALSE);
01593
01594 wp->pvUrls->add (strNewUrl);
01595 }
01596 }
01597
01598 fsDLWebPage* fsWebPageDownloader::FindWebPage(UINT nID)
01599 {
01600 for (size_t i = 0; i < m_vConfs.size (); i++)
01601 {
01602 if (m_vConfs [i].wp->nID == nID)
01603 return m_vConfs [i].wp;
01604 }
01605
01606 return NULL;
01607 }
01608
01609 void fsWebPageDownloader::DeleteWebPage(fsDLWebPage *wp)
01610 {
01611 t_wptree tree = NULL;
01612
01613 for (size_t i = 0; i < m_vConfs.size (); i++)
01614 {
01615 if (m_vConfs [i].wp == wp)
01616 {
01617 tree = m_vConfs [i].wptree;
01618 break;
01619 }
01620 }
01621
01622 if (tree == NULL)
01623 return;
01624
01625 t_wptree root = tree->GetRoot ();
01626
01627 Event (WPDE_WEBPAGEWILLBEDELETED, (int) wp);
01628
01629 m_vConfs.erase (m_vConfs.begin () + i);
01630
01631 for (i = 0; i < (size_t)root->GetLeafCount (); i++)
01632 {
01633 if (root->GetLeaf (i)->GetData ().nID == wp->nID)
01634 {
01635 root->DeleteLeaf (i);
01636 break;
01637 }
01638 }
01639
01640 SAFE_DELETE (wp->pvUnpLinks);
01641 SAFE_DELETE (wp->pvUrls);
01642 }
01643
01644 BOOL fsWebPageDownloader::Load(HANDLE hFile, WORD wVer)
01645 {
01646 if(wVer < 3)
01647 return Load_OLD (hFile, wVer < 2);
01648
01649 DWORD dw;
01650
01651 if (FALSE == fsReadStrFromFile (&m_strStartServer.pszString, hFile))
01652 return FALSE;
01653
01654 if (FALSE == ReadFile (hFile, &m_wpds.bDownloadFiles, sizeof (m_wpds.bDownloadFiles), &dw, NULL))
01655 return FALSE;
01656
01657 if (FALSE == ReadFile (hFile, &m_wpds.bDownloadImages, sizeof (m_wpds.bDownloadImages), &dw, NULL))
01658 return FALSE;
01659
01660 if (FALSE == ReadFile (hFile, &m_wpds.bDownloadStyles, sizeof (m_wpds.bDownloadStyles), &dw, NULL))
01661 return FALSE;
01662
01663 if (FALSE == ReadFile (hFile, &m_wpds.bNotAllFiles, sizeof (m_wpds.bNotAllFiles), &dw, NULL))
01664 return FALSE;
01665
01666 if (FALSE == ReadFile (hFile, &m_wpds.bNotAllImages, sizeof (m_wpds.bNotAllImages), &dw, NULL))
01667 return FALSE;
01668
01669 if (FALSE == ReadFile (hFile, &m_wpds.bNotAllPages, sizeof (m_wpds.bNotAllPages), &dw, NULL))
01670 return FALSE;
01671
01672 if (FALSE == ReadFile (hFile, &m_wpds.bSavePagesUnderHTM, sizeof (m_wpds.bSavePagesUnderHTM), &dw, NULL))
01673 return FALSE;
01674
01675 if (FALSE == ReadFile (hFile, &m_wpds.dwFlags, sizeof (m_wpds.dwFlags), &dw, NULL))
01676 return FALSE;
01677
01678 if (FALSE == ReadFile (hFile, &m_wpds.enExtsType, sizeof (m_wpds.enExtsType), &dw, NULL))
01679 return FALSE;
01680
01681 if (FALSE == ReadFile (hFile, &m_wpds.enImgsExtsType, sizeof (m_wpds.enImgsExtsType), &dw, NULL))
01682 return FALSE;
01683
01684 if (FALSE == ReadFile (hFile, &m_wpds.iDepth, sizeof (m_wpds.iDepth), &dw, NULL))
01685 return FALSE;
01686
01687 if (FALSE == ReadFile (hFile, &m_wpds.iReserved, sizeof (m_wpds.iReserved), &dw, NULL))
01688 return FALSE;
01689
01690 if (wVer < 6)
01691 {
01692 fsString str;
01693 if (FALSE == fsReadStrFromFile (&str.pszString, hFile))
01694 return FALSE;
01695 m_wpds.pDLGroup = _DldsGrps.FindGroupByName (str);
01696 }
01697 else
01698 {
01699 UINT nId;
01700 if (FALSE == ReadFile (hFile, &nId, sizeof (nId), &dw, NULL))
01701 return FALSE;
01702 m_wpds.pDLGroup = _DldsGrps.FindGroup (nId);
01703 }
01704
01705 if (m_wpds.pDLGroup == NULL)
01706 m_wpds.pDLGroup = _DldsGrps.FindGroup (GRP_OTHER_ID);
01707
01708 if (FALSE == fsReadStrFromFile (&m_wpds.strExts.pszString, hFile))
01709 return FALSE;
01710
01711 if (FALSE == fsReadStrFromFile (&m_wpds.strImgsExts.pszString, hFile))
01712 return FALSE;
01713
01714 if (FALSE == fsReadStrFromFile (&m_wpds.strFolderSaveTo.pszString, hFile))
01715 return FALSE;
01716
01717 if (FALSE == fsReadStrFromFile (&m_wpds.strHTMLExts.pszString, hFile))
01718 return FALSE;
01719
01720 if (FALSE == fsReadStrFromFile (&m_wpds.strUserName.pszString, hFile))
01721 return FALSE;
01722
01723 if (FALSE == fsReadStrFromFile (&m_wpds.strPassword.pszString, hFile))
01724 return FALSE;
01725
01726 if (wVer >= 5)
01727 {
01728
01729 int cItems;
01730 ASSERT (m_wpds.vIgnoreList.size () == 0);
01731 if (FALSE == ReadFile (hFile, &cItems, sizeof (int), &dw, NULL))
01732 return FALSE;
01733 for (int i = 0; i < cItems; i++)
01734 {
01735 fsWPDIgnoreListItem *item = new fsWPDIgnoreListItem;
01736 if (FALSE == fsReadStrFromFile (&item->strURL.pszString, hFile))
01737 return FALSE;
01738
01739 if (FALSE == ReadFile (hFile, &item->dwFlags, sizeof (DWORD),
01740 &dw, NULL))
01741 return FALSE;
01742
01743 m_wpds.vIgnoreList.add (item);
01744 }
01745 }
01746
01747 m_nMaxID = 0;
01748
01749 if (FALSE == Load (hFile, &m_pages, wVer))
01750 {
01751 Load_PerformRollback ();
01752 return FALSE;
01753 }
01754
01755 return TRUE;
01756 }
01757
01758 #pragma warning (disable:4706)
01759 void fsWebPageDownloader::GetPtrToFile(LPCSTR pszToFile, LPCSTR pszFromFile, LPSTR pszPtr)
01760 {
01761
01762 int cComm = 0;
01763
01764 char szToPath [10000];
01765 char szFromPath [10000];
01766
01767 fsGetPath (pszToFile, szToPath);
01768 fsGetPath (pszFromFile, szFromPath);
01769
01770 int lto = strlen (szToPath);
01771 int lfrom = strlen (szFromPath);
01772
01773 LPCSTR pszTo = szToPath;
01774 LPCSTR pszFrom = szFromPath;
01775
01776 cComm = min (lto, lfrom);
01777
01778 while (TRUE)
01779 {
01780 if (strnicmp (pszTo, pszFrom, cComm) == 0 && pszTo [cComm-1] == '\\')
01781 break;
01782
01783 cComm--;
01784 if (cComm < 1)
01785 return;
01786 }
01787
01788 pszTo += cComm;
01789 pszFrom += cComm;
01790
01791 char szFile [10000];
01792 fsGetFileName (pszToFile, szFile);
01793
01794 int cUppers = 0;
01795
01796 while (pszFrom = strchr (pszFrom+1, '\\'))
01797 cUppers ++;
01798
01799 *pszPtr = 0;
01800
01801 while (cUppers--)
01802 strcat (pszPtr, "..\\");
01803
01804 strcat (pszPtr, pszTo);
01805 strcat (pszPtr, szFile);
01806 }
01807 #pragma warning (default:4706)
01808
01809 void fsWebPageDownloader::GetFileForReplace(fsDLWebPage *wpwhere, fsDLWebPage *wpwhat, LPSTR pszFile)
01810 {
01811 fsString strFileWhat = wpwhat->dld ? wpwhat->dld->pMgr->get_OutputFilePathName () : wpwhat->strFile;
01812
01813 if (m_wpds.dwFlags & WPDF_KEEPFOLDERSTRUCTURE)
01814 {
01815 fsString strFileWhere = wpwhere->dld ? wpwhere->dld->pMgr->get_OutputFilePathName () : wpwhere->strFile;
01816 GetPtrToFile (strFileWhat, strFileWhere, pszFile);
01817
01818 }
01819 else
01820 fsGetFileName (strFileWhat, pszFile);
01821
01822 fsPathToGoodUrlPath (pszFile);
01823 }
01824
01825 void fsWebPageDownloader::GetDownloadingSiteName(LPSTR psz)
01826 {
01827 fsURL url;
01828 DWORD dw = 10000;
01829 url.Crack (GetStartURL ());
01830 url.Create (url.GetInternetScheme (), url.GetHostName (), url.GetPort (),
01831 "", "", "", psz, &dw);
01832 }
01833
01834 void fsWebPageDownloader::Load_PerformRollback()
01835 {
01836 for (size_t i = 0; i < m_vConfs.size (); i++)
01837 {
01838 vmsDownloadSmartPtr dld = m_vConfs [i].wp->dld;
01839 if (dld)
01840 {
01841 dld->pfnDownloadEventsFunc = NULL;
01842 }
01843 }
01844 }
01845
01846 DWORD fsWebPageDownloader::OnCheckFileExtIsOK(vmsDownloadSmartPtr dld)
01847 {
01848 BOOL bOK = TRUE;
01849
01850 ASSERT (dld->pMgr->GetDownloadMgr () != NULL);
01851 if (dld->pMgr->GetDownloadMgr () == NULL)
01852 return TRUE;
01853
01854 char szFile [MY_MAX_PATH];
01855 fsFileNameFromUrlPath (dld->pMgr->GetDownloadMgr ()->GetDNP ()->pszPathName,
01856 dld->pMgr->GetDownloadMgr ()->GetDNP ()->enProtocol == NP_FTP,
01857 TRUE, szFile, sizeof (szFile));
01858
01859 LPCSTR pszExt = strrchr (szFile, '.');
01860
01861 BOOL bExt = FALSE;
01862
01863 if (pszExt++)
01864 bExt = IsExtInExtsStr (m_wpds.strExts, pszExt);
01865
01866 if (m_wpds.bDownloadFiles == FALSE)
01867 bOK = FALSE;
01868
01869 if (m_wpds.enExtsType != WPDET_OFF)
01870 {
01871 if ( (bExt && m_wpds.enExtsType == WPDET_NOTDOWNLOAD) ||
01872 (bExt == FALSE && m_wpds.enExtsType == WPDET_DOWNLOAD) )
01873 {
01874 bOK = FALSE;
01875 }
01876 }
01877
01878 if (m_wpds.bNotAllFiles)
01879 {
01880
01881 if (fsIsServersEqual (dld->pMgr->GetDownloadMgr ()->GetDNP ()->pszServerName, m_strStartServer) == FALSE)
01882 bOK = FALSE;
01883 }
01884
01885 return bOK;
01886 }
01887
01888 void fsWebPageDownloader::CorrectUnpUrls(fsDLWebPage* wpfrom, fsDLWebPage* wpto)
01889 {
01890 char szFile [MY_MAX_PATH];
01891
01892 if (wpfrom == wpto)
01893 return;
01894
01895 if (wpfrom->pvUnpLinks == NULL)
01896 return;
01897
01898 while (wpfrom->pvUnpLinks->size ())
01899 {
01900
01901 UINT nWPIDWhere = wpfrom->pvUnpLinks->at (0).nWPIDWhere;
01902 fsDLWebPage *wpwhere = FindWebPage (nWPIDWhere);
01903
01904 fs::list <UINT> vnUrls;
01905 fs::list <_WP_LinkType> vLinkTypes;
01906
01907 for (int j = 0; j < wpfrom->pvUnpLinks->size ();)
01908 {
01909
01910 if (nWPIDWhere == wpfrom->pvUnpLinks->at (j).nWPIDWhere)
01911 {
01912
01913 vnUrls.add (wpfrom->pvUnpLinks->at (j).nParserUrl);
01914 vLinkTypes.add (wpfrom->pvUnpLinks->at (j).lt);
01915
01916 wpfrom->pvUnpLinks->del (j);
01917 }
01918 else
01919 {
01920
01921 j++;
01922 }
01923 }
01924
01925 if (wpwhere == NULL)
01926 continue;
01927
01928 HANDLE hFile = CreateFile (wpwhere->dld ? wpwhere->dld->pMgr->get_OutputFilePathName () : wpwhere->strFile,
01929 GENERIC_READ | GENERIC_WRITE, FILE_SHARE_READ,
01930 NULL, OPEN_EXISTING, 0, NULL);
01931
01932 if (hFile == INVALID_HANDLE_VALUE)
01933 {
01934
01935 continue;
01936 }
01937
01938 DWORD dwSize = GetFileSize (hFile, NULL);
01939 LPSTR pszHTML;
01940 fsnew (pszHTML, char, int (dwSize*1.5) + 10000 + 1);
01941
01942 if (wpto)
01943 GetFileForReplace (wpwhere, wpto, szFile);
01944 else
01945 strcpy (szFile, wpfrom->strURL);
01946
01947 UINT newlen = 0;
01948 if (ReadFile (hFile, pszHTML, dwSize, &dwSize, NULL))
01949 {
01950 pszHTML [dwSize] = 0;
01951 fsHTMLParser parser;
01952 parser.SetKillDupes (FALSE);
01953 parser.ParseHTML (pszHTML);
01954
01955 for (j = 0; j < vnUrls.size (); j++)
01956 {
01957 switch (vLinkTypes [j])
01958 {
01959 case WPLT_A:
01960 parser.ReplaceUrl (vnUrls [j], szFile);
01961 break;
01962
01963 case WPLT_IMG:
01964 parser.ReplaceImage (vnUrls [j], szFile);
01965 break;
01966
01967 case WPLT_STYLESHEET:
01968 parser.ReplaceLinkUrl (vnUrls [j], szFile);
01969 break;
01970 }
01971 }
01972
01973 newlen = parser.GetHTMLLength ();
01974 SetFilePointer (hFile, 0, NULL, FILE_BEGIN);
01975 SetEndOfFile (hFile);
01976 DWORD dw;
01977 WriteFile (hFile, pszHTML, newlen, &dw, NULL);
01978 }
01979
01980 delete [] pszHTML;
01981 CloseHandle (hFile);
01982 }
01983 }
01984
01985 int fsWebPageDownloader::ParseHTMLFrameUrls(fsHTMLParser &parser, t_wptree wptree, BOOL bFixUrlsOnly, LPCSTR pszBaseURL)
01986 {
01987 int cAdded = 0;
01988
01989 for (int i = 0; i < parser.GetFrameUrlCount (); i ++)
01990 {
01991 LPCSTR pszUrl = parser.GetFrameUrl (i);
01992
01993 if (*pszUrl == '#')
01994 continue;
01995
01996 LPSTR pszFullUrl;
01997
01998 fsUrlToFullUrl (pszBaseURL, pszUrl, &pszFullUrl);
01999 if (pszFullUrl == NULL)
02000 continue;
02001
02002 fsURL url;
02003 if (IR_SUCCESS != url.Crack (pszFullUrl, TRUE))
02004 {
02005 delete [] pszFullUrl;
02006 continue;
02007 }
02008
02009 if (*url.GetHostName () == 0)
02010 {
02011 delete [] pszFullUrl;
02012 continue;
02013 }
02014
02015 char szFile [MY_MAX_PATH];
02016 fsFileNameFromUrlPath (url.GetPath (), url.GetInternetScheme () == INTERNET_SCHEME_FTP,
02017 TRUE, szFile, sizeof (szFile));
02018
02019 if (m_wpds.bNotAllPages)
02020 {
02021
02022 if (fsIsServersEqual (url.GetHostName (), m_strStartServer) == FALSE)
02023 {
02024 parser.ReplaceFrameUrl (i, pszFullUrl);
02025 delete [] pszFullUrl;
02026 continue;
02027 }
02028 }
02029
02030 for (int j = 0; j < m_wpds.vIgnoreList.size (); j++)
02031 {
02032 fsURL url2;
02033 url2.Crack (m_wpds.vIgnoreList [j]->strURL);
02034
02035 if (fsIsServersEqual (url.GetHostName (), url2.GetHostName ()))
02036 {
02037 if (_strnicmp (url.GetPath (), url2.GetPath (),
02038 lstrlen (url2.GetPath ())) == 0)
02039 {
02040 fsURL urlStart;
02041 urlStart.Crack (GetRootPage ()->GetData ().strURL);
02042
02043 if (FALSE == fsIsServersEqual (urlStart.GetHostName (), url2.GetHostName ()) ||
02044 _strnicmp (urlStart.GetPath (), url2.GetPath (),
02045 lstrlen (url2.GetPath ())))
02046 {
02047
02048 BOOL bSkip = m_wpds.vIgnoreList [j]->dwFlags & WPD_ILITEM_SUBFOLDERSALSO;
02049
02050 int l = lstrlen (url.GetPath ()), l2 = lstrlen (url2.GetPath ());
02051 if (bSkip == FALSE)
02052 bSkip = l == l2;
02053
02054 if (bSkip == FALSE && l > l2)
02055 bSkip = (url.GetPath ()) [l] == '#';
02056
02057 if (bSkip)
02058 {
02059 parser.ReplaceFrameUrl (i, pszFullUrl);
02060 SAFE_DELETE_ARRAY (pszFullUrl);
02061 break;
02062 }
02063 }
02064 }
02065 }
02066 }
02067
02068 if (pszFullUrl == NULL)
02069 continue;
02070
02071 LPSTR pszWA = NULL, pszFA = NULL;
02072
02073 if (bFixUrlsOnly == FALSE)
02074 {
02075 if (CrackUrl (pszFullUrl, &pszWA, &pszFA))
02076 {
02077 delete [] pszFullUrl;
02078 pszFullUrl = pszWA;
02079 }
02080 }
02081
02082 fsDLWebPage *wp = FindWebPage (pszFullUrl);
02083
02084 if (wp == NULL && bFixUrlsOnly == FALSE)
02085 {
02086
02087 wp = AddPage (wptree, pszFullUrl, WPDPT_PAGE, TRUE);
02088
02089 if (wp == NULL)
02090 {
02091 SAFE_DELETE_ARRAY (pszFA);
02092 parser.ReplaceFrameUrl (i, pszFullUrl);
02093 delete [] pszFullUrl;
02094 continue;
02095 }
02096 }
02097
02098 if (wp && wp->pvUnpLinks)
02099 {
02100 _WP_UnprocessedLinks unplink;
02101 unplink.nWPIDWhere = wptree->GetData ().nID;
02102 unplink.nParserUrl = i;
02103 unplink.lt = WPLT_A;
02104 wp->pvUnpLinks->add (unplink);
02105 }
02106
02107 if ((m_wpds.dwFlags & WPDF_DONTSTOREPAGES) == 0)
02108 {
02109 if (wp)
02110 {
02111
02112 GetFileForReplace (&wptree->GetData (), wp, szFile);
02113 if (pszFA)
02114 strcat (szFile, strchr (pszFA, '#'));
02115
02116 parser.ReplaceFrameUrl (i, szFile);
02117 }
02118 else
02119 {
02120 parser.ReplaceFrameUrl (i, pszFullUrl);
02121 }
02122 }
02123
02124 delete [] pszFullUrl;
02125 }
02126
02127 return cAdded;
02128 }
02129
02130 BOOL fsWebPageDownloader::IsURLShouldBeIgnored(fsURL &url)
02131 {
02132
02133 BOOL bOnlyConditionPresent = FALSE;
02134
02135 for (int j = 0; j < m_wpds.vIgnoreList.size (); j++)
02136 {
02137 BOOL bOnlyCondition = m_wpds.vIgnoreList [j]->dwFlags & WPD_ILITEM_THISPATHONLY;
02138 if (bOnlyCondition)
02139 bOnlyConditionPresent = TRUE;
02140
02141 fsURL url2;
02142 url2.Crack (m_wpds.vIgnoreList [j]->strURL);
02143
02144 BOOL bSQ = FALSE;
02145
02146 if (fsIsServersEqual (url.GetHostName (), url2.GetHostName ()))
02147 {
02148 if (_strnicmp (url.GetPath (), url2.GetPath (),
02149 lstrlen (url2.GetPath ())) == 0)
02150 {
02151 bSQ = m_wpds.vIgnoreList [j]->dwFlags & WPD_ILITEM_SUBFOLDERSALSO;
02152
02153
02154 if (bSQ == FALSE)
02155 {
02156 int l = lstrlen (url.GetPath ()),
02157 l2 = lstrlen (url2.GetPath ());
02158
02159 bSQ = l == l2;
02160
02161
02162 if (bSQ == FALSE && l > l2)
02163 bSQ = (url.GetPath ()) [l2] == '#' || (url.GetPath ()) [l2] == '?';
02164 }
02165 }
02166 }
02167
02168 if (bSQ)
02169 return bOnlyCondition ? FALSE : TRUE;
02170 }
02171
02172 return bOnlyConditionPresent ? TRUE : FALSE;
02173 }