1: 2: 3: 4: 5: 6: 7: 8: 9: 10: 11: 12: 13: 14: 15: 16: 17: 18: 19: 20: 21: 22: 23: 24: 25: 26: 27: 28: 29: 30: 31: 32: 33: 34: 35: 36: 37: 38: 39: 40: 41: 42: 43: 44: 45: 46: 47: 48: 49: 50: 51: 52: 53: 54: 55: 56: 57: 58: 59: 60: 61: 62: 63: 64: 65: 66: 67: 68: 69: 70: 71: 72: 73: 74: 75: 76: 77: 78: 79: 80: 81: 82: 83: 84: 85: 86: 87: 88: 89: 90: 91: 92: 93: 94: 95: 96: 97: 98: 99: 100: 101: 102: 103: 104: 105: 106: 107: 108: 109: 110: 111: 112: 113: 114: 115: 116: 117: 118: 119: 120: 121: 122: 123: 124: 125: 126: 127: 128: 129: 130: 131: 132: 133: 134: 135: 136: 137: 138: 139: 140: 141: 142: 143: 144: 145: 146: 147: 148: 149: 150: 151: 152: 153: 154: 155: 156: 157: 158: 159: 160: 161: 162: 163: 164: 165: 166: 167: 168: 169: 170: 171: 172: 173: 174: 175: 176: 177: 178: 179: 180: 181: 182: 183: 184: 185: 186: 187: 188: 189: 190: 191: 192: 193: 194: 195: 196: 197: 198: 199: 200: 201: 202: 203: 204: 205: 206: 207: 208: 209: 210: 211: 212: 213: 214: 215: 216: 217: 218: 219: 220: 221: 222: 223: 224: 225: 226: 227: 228: 229: 230: 231: 232: 233: 234: 235: 236: 237: 238: 239: 240: 241: 242: 243: 244: 245: 246: 247: 248: 249: 250: 251: 252: 253: 254: 255: 256: 257: 258: 259: 260: 261: 262: 263: 264: 265: 266: 267: 268: 269: 270: 271: 272: 273: 274: 275: 276: 277: 278: 279: 280: 281: 282: 283: 284: 285: 286: 287: 288: 289: 290: 291: 292: 293: 294: 295: 296: 297: 298: 299: 300: 301: 302: 303: 304: 305: 306: 307: 308: 309: 310: 311: 312: 313: 314: 315: 316: 317: 318: 319: 320: 321: 322: 323: 324: 325: 326: 327: 328: 329: 330: 331: 332: 333: 334: 335: 336: 337: 338: 339: 340: 341: 342: 343: 344: 345: 346: 347: 348: 349: 350: 351: 352: 353: 354: 355: 356: 357: 358: 359: 360: 361: 362: 363: 364: 365: 366: 367: 368: 369: 370: 371: 372: 373: 374: 375: 376: 377: 378: 379: 380: 381: 382: 383: 384: 385: 386: 387: 388: 389: 390: 391: 392: 393: 394: 395: 396: 397: 398: 399: 400: 401: 402: 403: 404: 405: 406: 407: 408: 409: 410: 411: 412: 413: 414: 415: 416: 417: 418: 419: 420: 421: 422: 423: 424: 425: 426: 427: 428: 429: 430: 431: 432: 433: 434: 435: 436: 437: 438: 439: 440: 441: 442: 443: 444: 445: 446: 447: 448: 449: 450: 451: 452: 453: 454: 455: 456: 457: 458: 459: 460: 461: 462: 463: 464: 465: 466: 467: 468: 469: 470: 471: 472: 473: 474: 475: 476: 477: 478: 479: 480: 481: 482: 483: 484: 485: 486: 487: 488: 489: 490: 491: 492: 493: 494: 495: 496: 497: 498: 499: 500: 501: 502: 503: 504: 505: 506: 507: 508: 509: 510: 511: 512: 513: 514: 515: 516: 517: 518: 519: 520: 521: 522: 523: 524: 525: 526: 527: 528: 529: 530: 531: 532: 533: 534: 535: 536: 537: 538: 539: 540: 541: 542: 543: 544: 545: 546: 547: 548: 549: 550: 551: 552: 553: 554: 555: 556: 557: 558: 559: 560: 561: 562: 563: 564: 565: 566: 567: 568: 569: 570: 571: 572: 573: 574: 575: 576: 577: 578: 579: 580: 581: 582: 583: 584: 585: 586: 587: 588: 589: 590: 591: 592: 593: 594:
| unit uMultiThreadedWebspider;
interface
uses System.Classes, System.SysUtils, Winapi.ActiveX, Winapi.UrlMon, Vcl.Dialogs;
function UrlDownloadToStream(URL: String; Stream: TStream): Boolean; function CheckUTF8(tempstring : string) : integer; function PrepareURLforAnalysis(origurl,parsedurlhost,parsedurlprotocol: string): string;
var ActiveThreads : integer;
const sGeneralLinkTagStart = '<a'; sGeneralLinkTagEnd = '/a>'; sGeneralHrefTagStart = 'href="'; sGeneralHrefTagEnd = '"'; sGeneralMetaLinkTagStart = '<link '; sGeneralMetaLinkTagEnd = '>';
utNone = 0; utInternalURL = 1; utExternalURL = 2; utInternalImageLinkURL = 3; utExternalImageLinkURL = 4; utInternalFeedLinkURL = 5; utFileLinkURL = 6; utImageLinkURL = 7; utImageURL = 8; utNotAnalysedURL = 9; utExternalFeedLinkURL = 10;
smNone = 0; smInternalURL = 1; smExternalURL = 2; smInternalImageLinkURL = 3; smExternalImageLinkURL = 4; smInternalFeedLinkURL = 5; smFileLinkURL = 6; smImageLinkURL = 7; smImageURL = 8; smNotAnalysedURL = 9; smExternalFeedLinkURL = 10;
damNone = 0; damInternalURL = 1; damExternalURL = 2; damInternalImageLinkURL = 3; damExternalImageLinkURL = 4; damInternalFeedLinkURL = 5; damFileLinkURL = 6; damImageLinkURL = 7; damImageURL = 8; damNotAnalysedURL = 9; damExternalFeedLinkURL = 10;
type TWebURLParentDataItem = packed record URLString : string; OrigURLString : string; URLLinkString : string; end; PWebURLParentDataItem = ^TWebUrlParentDataItem;
TWebURLDataItem = packed record URLString : string; OrigURLString : string; URLLinkString : string; URLType : integer; Analysed : boolean; DeepAnalysed : boolean; HTTPStatus : integer; PageRank : integer; InternalLinks : integer; ExternalLinks : integer; SiteTitle : string; METATagKeywords : string; METATagDescription : string; SiteFollowTag : integer; FileSize : int64; ParentSites : TList; end; PWebURLDataItem = ^TWebURLDataItem;
TURLAnalysisBegin = procedure() of object; TURLAnalysisDone = procedure(index : integer) of object; TAddInternalLink = procedure(origurl,origlink,url : string) of object;
TURLAnalyser = class(TThread) strict private FFormHandle : THandle; FRootURL : string; FAnalyseURL : string; FParsedURLProtocol : string; FParsedURLHost : string; FIgnoreURLType : boolean; FURLName : string; FWWWURLName : string; FItemIndex : integer; Forigurlname : string; FStripParams : boolean; FDetectAllImages : boolean; FDetectFeeds : boolean; FSenderMode : integer;
FResultOrigURL : string; FResultURL : string; FResultOrigLink : string; FURLAnalysisBegin : TURLAnalysisBegin; FURLAnalysisDone : TURLAnalysisDone; FAddInternalLink : TAddInternalLink; procedure SyncURLAnalysisBegin; procedure SyncURLAnalysisDone; procedure SyncAddInternalLink; protected procedure Execute; override; public constructor Create(RootURL,AnalyseURL : string;IgnoreURLType : boolean;Index : integer); destructor Destroy; override; property URLAnalysisBegin : TURLAnalysisBegin read FURLAnalysisBegin write FURLAnalysisBegin; property URLAnalysisDone : TURLAnalysisDone read FURLAnalysisDone write FURLAnalysisDone; property AddInternalLink : TAddInternalLink read FAddInternalLink write FAddInternalLink; end;
implementation
function UrlDownloadToStream(URL: String; Stream: TStream): Boolean; var ppStream: IStream; statstg: TStatStg; lpBuffer: Pointer; dwRead: Integer; begin result:=False;
if Assigned(Stream) then begin if (URLOpenBlockingStream(nil, PChar(URL), IStream(ppStream), 0, nil) = S_OK) then begin try if (ppStream.Stat(statstg, STATFLAG_NONAME) = S_OK) then begin if (statstg.cbSize > 0) then begin lpBuffer:=AllocMem(statstg.cbSize); try if (ppStream.Read(lpBuffer, statstg.cbSize, @dwRead) = S_OK) then begin Stream.Write(lpBuffer^, dwRead); try Stream.Seek(-dwRead, soFromCurrent); finally result:=True; end; end; finally FreeMem(lpBuffer); end; end; end; finally ppStream:=nil; end; end; end; end;
function CheckUTF8(tempstring : string) : integer; begin result := -1;
if POS('ä',tempstring,1) <> 0 then result := 1; if POS('ü',tempstring,1) <> 0 then result := 1; if POS('ö',tempstring,1) <> 0 then result := 1; if POS('Ø',tempstring,1) <> 0 then result := 1; exit; if POS('&',tempstring,1) <> 0 then result := 2; if POS('ä',tempstring,1) <> 0 then result := 2; if POS('Ä',tempstring,1) <> 0 then result := 2; if POS('ü',tempstring,1) <> 0 then result := 2; if POS('Ü',tempstring,1) <> 0 then result := 2; if POS('ö',tempstring,1) <> 0 then result := 2; if POS('Ö',tempstring,1) <> 0 then result := 2;
end;
function PrepareURLforAnalysis(origurl,parsedurlhost,parsedurlprotocol: string): string; var tempurlstring : string; begin if origurl = '' then exit; tempurlstring := origurl; if tempurlstring[1] = '.' then delete(tempurlstring,1,1); tempurlstring := StringReplace(tempurlstring,'/http','http',[rfReplaceAll]); tempurlstring := StringReplace(tempurlstring,'/https','https',[rfReplaceAll]); tempurlstring := StringReplace(tempurlstring,'http//','http://',[rfReplaceAll]); tempurlstring := StringReplace(tempurlstring,'https//','https://',[rfReplaceAll]); tempurlstring := StringReplace(tempurlstring,'http:/http','http',[rfReplaceAll]); if ((POS(UpperCase(ParsedURLHost),UpperCase(tempurlstring)) = 0) and (POS(UpperCase('http://'),UpperCase(tempurlstring)) = 0) and (POS(UpperCase('https://'),UpperCase(tempurlstring)) = 0)) then begin if ((tempurlstring[1] <> '/') and (tempurlstring[1] <> '?')) then begin if POS(UpperCase(ParsedURLProtocol + '://'),UpperCase(tempurlstring),1) = 0 then tempurlstring := '/' + tempurlstring; end; if POS(UpperCase('www.'),UpperCase(origurl),1) <> 0 then begin if POS(UpperCase(ParsedURLProtocol + '://'),UpperCase(tempurlstring),1) = 0 then tempurlstring := ParsedURLProtocol + '://www.' + ParsedURLHost + tempurlstring end else begin if POS(UpperCase(ParsedURLProtocol + '://'),UpperCase(tempurlstring),1) = 0 then tempurlstring := ParsedURLProtocol + '://' + ParsedURLHost + tempurlstring end; end; tempurlstring := StringReplace(tempurlstring,'www.www','www',[rfReplaceAll]); result := tempurlstring; end;
function parseProtocol(url: string): string; var i: integer; begin
i := pos('://', Url); if i <> 0 then begin Result := Copy(url, 1, i - 1); end else begin result := 'http'; end; end;
function parseHost(url: string): string; var i, pathSep, portSep: integer; tmpstr: string; begin tmpstr := url;
i := pos('://', tmpstr); if i <> 0 then begin Inc(i, 2); end; Delete(tmpstr, 1, i);
pathSep := Pos('/', tmpstr); if pathSep <> 0 then begin Delete(tmpstr, pathSep, Length(tmpstr)); end;
portSep := Pos(':', tmpstr); if portSep <> 0 then begin Delete(tmpstr, portSep, Length(tmpstr)); end;
i := pos('?', tmpstr); if i <> 0 then begin Delete(tmpstr, i, Length(tmpstr)); end;
i := pos('#', tmpstr); if i <> 0 then begin Delete(tmpstr, i, Length(tmpstr)); end;
Result := tmpstr; end;
function ExtractURLFileName(URL: string): string; var i: Integer; temphost : string; temppos : integer; begin Result := ''; temphost := ''; temphost := parseHost(Url); if temphost <> '' then temppos := POS(Uppercase(temphost),UpperCase(Url),1); if temppos <> 0 then begin Url := COPY(Url,temppos + Length(temphost),Length(Url)-Length(temphost)+1); end; if POS(Uppercase('http://'),UpperCase(Url),1) = 1 then Url := COPY(Url,Length('http://'),Length(Url)-Length('http://')+1); if POS(Uppercase('https://'),UpperCase(Url),1) = 1 then Url := COPY(Url,Length('https://'),Length(Url)-Length('https://')+1); if POS(Uppercase('http://www.'),UpperCase(Url),1) = 1 then Url := COPY(Url,Length('http://www.'),Length(Url)-Length('http://www.')+1); if POS(Uppercase('https://www.'),UpperCase(Url),1) = 1 then Url := COPY(Url,Length('https://www.'),Length(Url)-Length('https://www.')+1); i := LastDelimiter('/', Url); Result := Copy(Url, i + 1, Length(Url) - (i)); end;
constructor TURLAnalyser.Create(RootURL,AnalyseURL: string;IgnoreURLType : boolean;index : integer); begin inherited Create(true); Synchronize(SyncURLAnalysisBegin); FItemIndex := Index; FRootURL := RootURL; FAnalyseURL := AnalyseURL; FIgnoreURLType := IgnoreURLType; FParsedURLProtocol := parseProtocol(RootURL); FParsedURLHost := parseHost(RootURL); Furlname := FParsedURLProtocol + '://' + FParsedURLHost; Fwwwurlname := FParsedURLProtocol + '://www.' + FParsedURLHost; end;
destructor TURLAnalyser.Destroy; begin Synchronize(SyncURLAnalysisDone); inherited; end;
procedure TURLAnalyser.Execute; var URLSource : TStringStream; urlstring : string; tempencoding : TEncoding; tempSiteTitleTagString : string; tempstring,tempstatuscode,completerobotstagstring,completelinkstring,completeurlstring,completeoriginalurlstring,temppagerank : string; sitetitletagstartpos,titletagpos,URLIndex,templistindex,i,robotstagstartpos,robotstagendpos,hrefstartpos,hrefendpos,linkstartpos,linkendpos,imagetagpos : integer; urlglobalfollowtag,internallinks,externallinks : integer; tempkeywords, tempdescription, temporigurl : string; TempList : TList; begin try if FAnalyseURL <> '' then begin URLSource := TStringStream.Create; if UrlDownloadToStream(FAnalyseURL,URLSource) = true then begin case CheckUTF8(URLSource.DataString) of -1 : urlstring := URLSource.DataString; 1 : urlstring := Utf8ToString(URLSource.DataString); 2 : urlstring := URLSource.DataString; end; hrefstartpos := 1; repeat hrefstartpos := POS(UpperCase(sGeneralHrefTagStart),UpperCase(urlstring),hrefstartpos); if hrefstartpos <> 0 then begin hrefendpos := POS(UpperCase(sGeneralhreftagend),urlstring,hrefstartpos+Length(sGeneralHrefTagStart)); if hrefendpos <> 0 then begin completeoriginalurlstring := COPY(urlstring,hrefstartpos+Length(sGeneralHrefTagStart),hrefendpos-hrefstartpos-Length(sGeneralHrefTagEnd)-Length(sGeneralHrefTagStart)+1); FResultOrigURL := completeoriginalurlstring; completeurlstring := PrepareURLforAnalysis(completeoriginalurlstring,FParsedURLHost,FParsedURLProtocol); FResultURL := completeurlstring; for i := hrefstartpos downto 0 do begin if ((UpperCase(urlstring[i]) = UpperCase('<')) and (UpperCase(urlstring[i+1]) = UpperCase('a')) and (UpperCase(urlstring[i+2]) = UpperCase(' '))) then begin linkstartpos := i; linkendpos := POS(UpperCase(sGeneralLinkTagEnd),UpperCase(urlstring),linkstartpos+Length(sGeneralLinkTagEnd)); if linkendpos <> 0 then begin completelinkstring := COPY(urlstring,linkstartpos,linkendpos-linkstartpos+Length(sGeneralLinkTagEnd)); FResultOrigLink := completelinkstring; break; end else begin end; end else begin end; if ((UpperCase(urlstring[i]) = UpperCase('<')) and (UpperCase(urlstring[i+1]) = UpperCase('l')) and (UpperCase(urlstring[i+2]) = UpperCase('i')) and (UpperCase(urlstring[i+3]) = UpperCase('n')) and (UpperCase(urlstring[i+4]) = UpperCase('k')) and (UpperCase(urlstring[i+5]) = UpperCase(' '))) then begin linkstartpos := i; linkendpos := POS(UpperCase(sGeneralMetaLinkTagEnd),UpperCase(urlstring),linkstartpos+length(sGeneralMetaLinkTagStart)); if linkendpos <> 0 then begin completelinkstring := COPY(urlstring,linkstartpos,linkendpos-linkstartpos+Length(sGeneralLinkTagEnd)-1); FResultOrigLink := completelinkstring; break; end else begin end; end else begin end; end; if ((POS(UpperCase(FRootURL),UpperCase(completeurlstring),1) = 1) and (FIgnoreURLType = false)) or (((POS(UpperCase(Furlname),UpperCase(completeurlstring),1) = 1) or (POS(UpperCase(Fwwwurlname),UpperCase(completeurlstring),1) = 1)) and (FIgnoreURLType = true)) then begin tempstring := ExtractURLFileName(completeurlstring); if ((tempstring <> '') and (tempstring <> ' ') and (tempstring <> '/') and (POS('?',tempstring,1) = 0) and (POS('%',tempstring,1) = 0) and (POS('&',tempstring,1) = 0) and (POS(':',tempstring,1) = 0) and (POS('#',tempstring,1) = 0) and (POS('.php',tempstring,1) = 0) and (POS('.htm',tempstring,1) = 0) and (POS('.',tempstring,1) <> 0)) or (POS('.css',tempstring,1) <> 0) or (POS('.js',tempstring,1) <> 0) then begin end else begin Synchronize(SyncAddInternalLink);
end; end else begin
end; if linkendpos > hrefendpos then hrefstartpos := linkendpos else hrefstartpos := hrefendpos+Length(sGeneralLinkTagEnd); end else begin hrefstartpos := hrefstartpos + length(sGeneralHrefTagStart) + length(sGeneralHrefTagEnd); end; end else begin end; until hrefstartpos = 0; end else begin end; end; except on E: Exception do begin end; end; if URLSource <> nil then freeandnil(URLSource); end;
procedure TURLAnalyser.SyncAddInternalLink; begin if Assigned(AddInternalLink) then FAddInternalLink(FResultOrigURL,FResultOrigLink,FResultURL); end;
procedure TURLAnalyser.SyncURLAnalysisBegin; begin if Assigned(URLAnalysisBegin) then FURLAnalysisBegin(); end;
procedure TURLAnalyser.SyncURLAnalysisDone; begin if Assigned(URLAnalysisDone) then FURLAnalysisDone(FItemIndex); end;
end. |