C#使用HtmlAgilityPack爬虫实例

    xiaoxiao2021-03-25  151

    使用 HtmlAgilityPack 类库解析 HTML 非常方便,网上的资料很多,可以自行搜索了解。

    下面上一个非常简单的小例子

    要爬取的信息如下:

    首先要引用HtmlAgilityPack.dll文件

    上代码:

    /// <summary>
    /// Downloads the target page and extracts the title and address text from it.
    /// </summary>
    /// <remarks>
    /// Returns without doing anything when the page could not be downloaded.
    /// A value is left as null when its XPath matches no node.
    /// </remarks>
    internal void Run()
    {
        string url = "爬取网站URL";
        string res = HttpTool.Excute(url); // send the request and get the page

        // HttpTool.Excute returns null when every attempt fails; LoadHtml does not accept null.
        if (string.IsNullOrEmpty(res))
        {
            return;
        }

        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(res); // load the html
        HtmlNode documentNode = doc.DocumentNode;

        // SelectSingleNode returns null when nothing matches, so check before reading InnerText.
        HtmlNode titleNode = documentNode.SelectSingleNode("//div[@class='main-title']/h1");
        HtmlNode addressNode = documentNode.SelectSingleNode("//div[@class='location']/span");

        // Locate the nodes and take their trimmed text values.
        string hname = titleNode == null ? null : titleNode.InnerText.Trim();
        string address = addressNode == null ? null : addressNode.InnerText.Trim();
    }

    // HttpTool class (the helper used by Run above) is listed next.

    using System;
    using System.Diagnostics;
    using System.IO;
    using System.Net;
    using System.Text;

    namespace Spider
    {
        /// <summary>
        /// Small helper that performs one HTTP GET or POST request and exposes the
        /// response body as text through <see cref="Result"/>.
        /// </summary>
        public class HttpTool : IDisposable
        {
            /// <summary>Number of times <see cref="Excute"/> tries a request before giving up.</summary>
            private const int MaxAttempts = 3;

            /// <summary>
            /// Requests <paramref name="url"/> and returns the response body, trying up to three times.
            /// </summary>
            /// <param name="url">Address to request.</param>
            /// <param name="data">POST body; when null or empty a GET request is sent instead.</param>
            /// <returns>The response body, or null when every attempt failed.</returns>
            public static string Excute(string url, string data = null)
            {
                for (int attempt = 1; attempt <= MaxAttempts; attempt++)
                {
                    try
                    {
                        using (HttpTool tool = new HttpTool(url, data))
                        {
                            return tool.Result;
                        }
                    }
                    catch (Exception ex)
                    {
                        // Best effort: keep retrying, but leave a trace instead of
                        // silently discarding the reason for the failure.
                        Trace.TraceWarning(
                            "HttpTool: attempt {0}/{1} for {2} failed: {3}",
                            attempt, MaxAttempts, url, ex.Message);
                    }
                }

                return null;
            }

            /// <summary>Address the request is sent to.</summary>
            public string Url { get; set; }

            /// <summary>The request created by <see cref="InitRequest"/>.</summary>
            public HttpWebRequest Request { get; private set; }

            /// <summary>The response received for <see cref="Request"/>; null after <see cref="Dispose"/>.</summary>
            public HttpWebResponse Response { get; private set; }

            /// <summary>The response body read as text.</summary>
            public string Result { get; private set; }

            /// <summary>
            /// Sends the request immediately and reads the whole response body into <see cref="Result"/>.
            /// </summary>
            /// <param name="url">Address to request.</param>
            /// <param name="postData">POST body; when null or empty a GET request is sent instead.</param>
            public HttpTool(string url, string postData = null)
            {
                Url = url;

                // NOTE(review): InitRequest is virtual and runs before a derived class's
                // constructor body; overrides must not rely on derived-class state.
                InitRequest();

                try
                {
                    if (string.IsNullOrEmpty(postData))
                    {
                        InitResponse();
                    }
                    else
                    {
                        InitResponse(postData);
                    }

                    // Fetch the response stream once; disposing the reader also disposes the stream.
                    using (var stream = Response.GetResponseStream())
                    using (var reader = new StreamReader(stream))
                    {
                        Result = reader.ReadToEnd();
                    }
                }
                catch
                {
                    // The caller never receives the instance when the constructor throws,
                    // so release the response here before propagating the error.
                    Dispose();
                    throw;
                }
            }

            /// <summary>
            /// Initializes the request. Header values can be extended as needed.
            /// </summary>
            public virtual void InitRequest()
            {
                Request = (HttpWebRequest)WebRequest.Create(Url);

                // Other settings such as Request.KeepAlive can be configured here if required.
                // These properties take the header VALUE only; the header name must not be repeated.
                Request.Accept = "text/html, application/xhtml+xml, */*";
                Request.UserAgent = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)";
                Request.Timeout = 6000;
                Request.ReadWriteTimeout = 6000;
            }

            /// <summary>
            /// Sends the request as GET and stores the response.
            /// </summary>
            public void InitResponse()
            {
                Request.Method = "GET";
                Response = (HttpWebResponse)Request.GetResponse();
            }

            /// <summary>
            /// Sends the request as POST and stores the response.
            /// </summary>
            /// <param name="postData">POST body.</param>
            /// <param name="encoding">Encoding used to turn <paramref name="postData"/> into bytes.</param>
            /// <exception cref="ArgumentNullException">
            /// <paramref name="postData"/> or <paramref name="encoding"/> is null.
            /// </exception>
            public void InitResponse(string postData, Encoding encoding)
            {
                if (postData == null)
                {
                    throw new ArgumentNullException("postData");
                }

                if (encoding == null)
                {
                    throw new ArgumentNullException("encoding");
                }

                Request.Method = "POST";
                byte[] body = encoding.GetBytes(postData);
                Request.ContentLength = body.Length;

                // using guarantees the request stream is closed even if Write throws.
                using (var requestStream = Request.GetRequestStream())
                {
                    requestStream.Write(body, 0, body.Length);
                }

                Response = (HttpWebResponse)Request.GetResponse();
            }

            /// <summary>
            /// Sends the request as POST with the body encoded as UTF-8.
            /// </summary>
            /// <param name="postData">POST body.</param>
            public void InitResponse(string postData)
            {
                InitResponse(postData, Encoding.UTF8);
            }

            /// <summary>
            /// Closes the response, if any. Safe to call more than once.
            /// </summary>
            public void Dispose()
            {
                if (Response != null)
                {
                    Response.Close();
                    Response = null;
                }
            }
        }
    }

    转载请注明原文地址: https://ju.6miu.com/read-11743.html

    最新回复(0)