macOS 用PHP调用curl抓取不到https的数据
泻药,你好。
1. header 的信息尽量写全。
2. 抓取 https 的网页需要添加上 cookie 信息,你加上 cookie 信息试一下。
3. 加上 CURLOPT_USERAGENT 和 CURLOPT_REFERER 的信息。
https://github.com/ejoful/curl/blob/master/taobao.php 这里是我前不久写的抓取 taobao 搜索商品页面的代码,亲测有效,你可以试试:在 config.php 写上你的数据库配置信息,taobao_data.sql 是数据表的信息,直接在 cmd 中运行 taobao.php 就行,tb_product(2).xlsx 是抓取出的商品信息列表。
重点代码:
function get_html( $url, $row, $con ){
    $ch = curl_init();
    // 设置浏览器的特定header
    curl_setopt($ch, CURLOPT_HTTPHEADER, array(
        "Host: s.taobao.com",
        "Connection: keep-alive",
        "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Upgrade-Insecure-Requests: 1",
        "DNT:1",
        "Accept-Language: zh-CN,zh;q=0.8,en-GB;q=0.6,en;q=0.4,en-US;q=0.2",
        "Cookie:cna=ecujDgxJEU8CAdrwlTLMuyK+; thw=cn; miid=7140089985405878683; _m_user_unitinfo_=center; v=0; alitrackid=www.taobao.com; swfstore=164423; uc2=wuf=https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DyuLwEC5VzXl67TX71pM-Pt5ijrzvVp76FosboF3-bRXW_TIMHJDVmk32CWs6e7ra1SuZudckIhHttgRUS5bN_0MaK1Kv-zHpfr0mjFVBPh-Crtxj19HCnfLT8x4uC3p5%26wd%3D%26eqid%3Db1f8688b001197fa000000035668de3b; uc3=nk2=AmkbKafOx9I%3D&id2=UU8PbnneKzSx&vt3=F8dAScPiH8lvv%2FHL%2BUQ%3D&lg2=UIHiLt3xD8xYTw%3D%3D; existShop=MTQ0OTczNjI5Mw%3D%3D; lgc=axianzia; tracknick=axianzia; sg=a19; cookie2=18b3234c8a526abaa23707836bdb48db; mt=np=&ci=3_1; cookie1=U%2BItOIkoJQAKEBg7UNusLVWk5N%2Bjy%2B0nBH%2BLYA08k7o%3D; unb=277562651; skt=3815ab30afb05045; t=441f8ce4777a72bb7ea38953b39d0d4c; _cc_=U%2BGCWk%2F7og%3D%3D; tg=0; _l_g_=Ug%3D%3D; _nk_=axianzia; cookie17=UU8PbnneKzSx; uc1=cookie14=UoWzUGNgydE6eg%3D%3D&existShop=false&cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cym=1&cookie21=UIHiLt3xSi%2BtvZI3oKTk0Q%3D%3D&tag=1&cookie15=U%2BGCWk%2F75gdr5Q%3D%3D&pas=0; lastalitrackid=sec.taobao.com; _tb_token_=7838551b54eab; JSESSIONID=D618280BE80EDEC49CB088A621098A09; _m_h5_tk=75ed71e765ff08d7ffa6f07be4dbc4e4_1449744286222; _m_h5_tk_enc=bfdd680ef33e03310ba83a1c1d31628f; x=e%3D1%26p%3D*%26s%3D0%26c%3D0%26f%3D0%26g%3D0%26t%3D0%26__ll%3D-1%26_ato%3D0; whl=-1%260%260%261449741955290; isg=5DD9B27D498B0FDB607B9BAA2130DC57; l=AszMmdUv13i10ljT0/VejoQ6HCT-VHCv",
    ));
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:41.0) Gecko/20100101 Firefox/41.0');
    // 在HTTP请求头中"Referer: "的内容。
    curl_setopt($ch, CURLOPT_REFERER,"https://s.taobao.com/search?q=" . $row . "&ie=utf8&app=detailproduct&through=1");
    curl_setopt($ch, CURLOPT_ENCODING, "gzip, deflate, sdch");
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_TIMEOUT,120);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);//302redirect
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
    curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, 2);
    $html = curl_exec($ch);
    curl_close($ch);
    if($html === false) {
        echo 'Curl error: ' . curl_error($ch) . "<br>\\\r";
        // 抓取出错的数据status设置为0
        $sql = "UPDATE `tb_product_list` SET `status` = '0' WHERE `id` = " . $row;
        $retval = mysql_query($sql, $con);
        if(!$retval) {
            print_r('Could not connect: ' . mysql_error() . " on line ".__LINE__."<br>\\\r");
        }
        return NULL;
    } else {
        //正则表达式去除所有空格(包括换行 空格 &nbsp;)
        $html = preg_replace("/(\\s|\\&nbsp\\;| |\\xc2\\xa0)/","",$html);
        // 匹配js
        $pattern = '/<script>g_page_config=(.*?);g_srp_loadCss\\(\\).*?<\\/script>/si';
        //抓取json数据
        preg_match_all($pattern,$html,$result);
        if (empty($result)) {
            //返回False
            return NULL;
        } else {
            // 将json转为数组
            $data = json_decode($result);
            // print_r($data);
            // 获取商品数据
            $res = $data->mods->itemlist->data->auctions;
            // 获取总页数
            $res = empty($data->mods->pager->data->totalPage) ? 2 : $data->mods->pager->data->totalPage;
            //返回json数据
            return $res;
        }
    }
}
推荐阅读
- 关于用phpfsocket 写Post, 模拟http 报文怎样写入要传输的处理数据
- 新互联网网站用Java还靠谱么对比Php,Python,Ruby的话
- 求助!PHP开发,怎么样才能具备在有千万级流量网站开发环境下的基本工作能力
- PHP程序员岗位招聘面试题有哪些
- 特斯拉|特斯拉或将允许车主远程调用Autopilot摄像头
- 2017年各种语言的就业形势怎样(Android,iOS,PHP,Java,前端)
- Discuz、PHPWind之类BBS程序哪个运行打开最高效
- webpack打包后的页面,在集成时,怎样提供js接口给父页面调用呢
- 为啥PHP社区鲜有异步队列,分布式计算,数据分析的项目和讨论
- javascript 中this的四种调用模式