C++或C抓取网页内容

合集下载
  1. 下载文档前请自行甄别文档内容的完整性,平台不提供额外的编辑、内容补充、找答案等附加服务。
  2. "仅部分预览"的文档,不可在线预览部分如存在完整性等问题,可反馈申请退款(可完整预览的文档不适用该条件!)。
  3. 如文档侵犯您的权益,请联系客服反馈,我们会尽快为您处理(人工客服工作时间:9:00-18:30)。

GetMethod httpMethod = new GetMethod("");
int statusCode = new HttpClient().executeMethod(httpMethod);
if(statusCode == HttpStatus.SC_OK)
using namespace std;
#define DEFAULT_PAGE_BUF_SIZE 1048576
void main()
{
WSADATA wsaData;
int err;
err = WSAStartup(MAKEWORD(2,2), &wsaData);
if( err != 0 )
connect(sock, (SOCKADDR*)&sa, sizeof(sa));
if(sock == -1)
{
return;
}
if(sock == -2)
{
return;
}
// send the "GET" data
ret = send(sock, request, strlen(request), 0);
struct hostent *hp;
hp = gethostbyname(host);
if(hp == NULL)
{
cout << "gethostbyname() error in GetIpByHost: " << host << endl;
return;
printf(" %s\n " ,(LPCTSTR)content);
}
pfile -> Close();
delete pfile;
session.Close();
return 0 ;
sa.sin_family = AF_INET;
sa.sin_port = htons(80);
sa.sin_addr.s_addr = inet_addr(inet_ntoa(inAddr));
sockTO_TCP);
}
Windows下用socket:
#include <string>
#include <iostream>
#include <fstream>
#include "winsock2.h"
#include <time.h>
#pragma comment(lib, "ws2_32.lib")
C++或C抓取网页内容
2009-03-04 18:30
Windows VC下的做法:
#include <stdio.h>
#include <afxinet.h>
int main(int argc, char* argv[])
{
CInternetSession session("HttpClient");
{
return;
}
// timer is start
clock_t start, finish;
double duration;
start = clock();
char host[] = "";
char *request = "GET / HTTP/1.0\r\nHost: \r\nConnection: Close\r\n\r\n";
ofstream ofs;
ofs.open("ofs.txt");
ofs << pageBuf << endl;
ofs.close();
free(pageBuf);
closesocket(sock);
WSACleanup();
// timer is finish
finish = clock();
// 网页内容长度。可以从http头部数据中获取 "Content-Length:"
int m_nContentLength = DEFAULT_PAGE_BUF_SIZE;
char *pageBuf;
pageBuf = (char *)malloc(m_nContentLength);
if(ret > 0)
{
bytesRead += ret;
}
}
pageBuf[bytesRead] = '\0';
cout << bytesRead << endl;
// write the html content to the file
}
// 获取域名对应的IP
struct in_addr inAddr;
LPSTR lpAddr;
lpAddr = hp->h_addr;
memmove(&inAddr,lpAddr,4);
int sock, ret = 0, optval = 1;
struct sockaddr_in sa;
pfile -> QueryInfoStatusCode(dwStatusCode);
if(dwStatusCode == HTTP_STATUS_OK)
{
CString content;
CString data;
方法2. system(curl )
方法3. 直接写 socket 程序,把获取到的内容存入 buffer 中。
memset(pageBuf, 0, m_nContentLength);
int bytesRead = 0;
while(ret > 0)
{
ret = recv(sock, pageBuf + bytesRead, m_nContentLength - bytesRead, 0);
char * url = " /simcard.php?simcard=1392658";
CHttpFile* pfile = (CHttpFile *)session.OpenURL(url);
DWORD dwStatusCode;
while (pfile -> ReadString(data))
{
content += data + "\r\n";
}
content.TrimRight();
内容取过来之后,总是希望从中拣出需要的数据,可惜 VC6 中没有自己的正则表达式库,所以下一步要学用 boost 的正则表达式库。
Linux 下最简单,三种办法: #34du.html。
{
System.out.println(httpMethod.getResponseBodyAsString());
}
httpMethod.releaseConnection();
GetMethod httpMethod = new GetMethod(""); int statusCode = new HttpClient().executeMethod(httpMethod); if(statusCode == HttpStatus.SC_OK) { System.out.println(httpMethod.getResponseBodyAsString()); } httpMethod.releaseConnection();
duration = (double)(finish - start) / CLOCKS_PER_SEC;
cout << "have cost " << duration << " seconds\n";
return;
}
其他如不从缓存中读取内容及如何使用代理连接现在就不说了,可以参考下面的链接,或者下次补上。另外不妨看看 Java 是如何读取 URL 内容的,更简单。
相关文档
最新文档