{"id":3182,"date":"2018-11-25T02:23:26","date_gmt":"2018-11-24T18:23:26","guid":{"rendered":"https:\/\/tao0.date\/?p=3182"},"modified":"2018-11-25T02:30:14","modified_gmt":"2018-11-24T18:30:14","slug":"python%e7%88%ac%e8%99%abcoursera%e6%8a%93%e7%ab%99","status":"publish","type":"post","link":"https:\/\/tao0.date\/?p=3182","title":{"rendered":"Python\u722c\u866bCoursera\u6293\u7ad9"},"content":{"rendered":"<div>\n<h2>0.1. cookie\u5904\u7406<\/h2>\n<p>\u9700\u8981\u8fdb\u884c\u767b\u9646\u7684\u65f6\u5019, \u8981\u8fdb\u884ccookie\u7684\u5904\u7406,\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5<\/p>\n<pre class=\"hljs python\"><code class=\"python\">cookie = cookielib.CookieJar()\r\nopener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))\r\nurllib2.install_opener(opener)\r\nreq = urllib2.Request(url)\r\ncontent = urllib2.urlopen(req)\r\n<\/code><\/pre>\n<h2>0.2. \u8868\u5355\u5904\u7406<\/h2>\n<p>\u67d0\u4e9b\u7f51\u7ad9\u9700\u8981\u8fdb\u884c\u8d26\u6237\u548c\u5bc6\u7801\u767b\u9646, \u9700\u8981\u4f7f\u7528POST\u65b9\u6cd5\u5411\u670d\u52a1\u5668\u53d1\u9001\u8d26\u6237\u548c\u5bc6\u7801\u8868\u5355\u6570\u636e, \u8fd9\u91cc\u5c31\u9700\u8981\u6a21\u62df\u767b\u9646.<\/p>\n<blockquote><p>\u5982\u4f55\u83b7\u53d6\u8868\u5355\u6570\u636e\u7684\u683c\u5f0f\u5462?<\/p><\/blockquote>\n<p>\u901a\u8fc7\u8c37\u6b4c\u6d4f\u89c8\u5668\u5f00\u53d1\u8005\u5de5\u5177\u4e2dNetwork\u9501\u5b9a\u8bf7\u6c42\u5934\u90e8\u548cpost\u53d1\u51fa\u7684\u8868\u5355\u6570\u636e,\u4f2a\u88c5\u8868\u5355\u6570\u636e<\/p>\n<blockquote><p>\u5f53\u7531\u4e8eMethod\u592a\u591a, \u627e\u4e0d\u5230POST\u63d0\u4ea4\u767b\u5f55\u8bf7\u6c42Method\u65b9\u6cd5\u7684\u65f6\u5019, \u53ef\u4ee5\u5c1d\u8bd5\u4f7f\u7528\u9519\u8bef\u5bc6\u7801, \u8fd9\u6837\u5c31\u53ef\u4ee5\u5bb9\u6613\u7684\u627ePOST\u65b9\u6cd5\u5bf9\u5e94\u7684\u5934\u90e8.<\/p><\/blockquote>\n<div class=\"image-package\">\n<p><img decoding=\"async\" 
src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/940c1-snip20141207_2.png\" alt=\"\u627e\u5230POST\u65b9\u6cd5\" data-original-src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/940c1-snip20141207_2.png\" \/><\/p>\n<div class=\"image-caption\">\u627e\u5230POST\u65b9\u6cd5<\/div>\n<\/div>\n<div class=\"image-package\">\n<p><img decoding=\"async\" src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/3b2a8-snip20141207_3.png\" alt=\"\u8868\u5355\u5904\u7406\" data-original-src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/3b2a8-snip20141207_3.png\" \/><\/p>\n<div class=\"image-caption\">\u8868\u5355\u5904\u7406<\/div>\n<\/div>\n<pre class=\"hljs python\"><code class=\"python\">form_data = urllib.urlencode({  <span class=\"hljs-comment\">#\u6ce8\u610furlencode\u65b9\u6cd5<\/span>\r\n    <span class=\"hljs-string\">\"email\"<\/span>: self.user_name,\r\n    <span class=\"hljs-string\">\"password\"<\/span>: self.password,\r\n    <span class=\"hljs-string\">\"webrequest\"<\/span>: <span class=\"hljs-string\">\"true\"<\/span>\r\n})  \r\n<\/code><\/pre>\n<h2>0.3. 
\u9632\u76d7\u94fe\u548c\u4f2a\u88c5\u6210\u6d4f\u89c8\u5668\u8bbf\u95ee<\/h2>\n<p>\u9632\u76d7\u94fe\u5c31\u662f\u9700\u8981\u5728\u8bf7\u6c42\u7684\u5934\u90e8\u52a0\u5165<code>Referer<\/code>\u5b57\u6bb5, Referer \u6307\u7684\u662fHTTP\u5934\u90e8\u7684\u4e00\u4e2a\u5b57\u6bb5, \u7528\u6765\u8868\u793a\u4ece\u54ea\u513f\u94fe\u63a5\u5230\u76ee\u524d\u7684\u7f51\u9875\uff0c\u91c7\u7528\u7684\u683c\u5f0f\u662fURL\u3002\u6362\u53e5\u8bdd\u8bf4\uff0c\u501f\u7740 HTTP Referer \u5934\u90e8\u7f51\u9875\u53ef\u4ee5\u68c0\u67e5\u8bbf\u5ba2\u4ece\u54ea\u91cc\u800c\u6765\uff0c\u8fd9\u4e5f\u5e38\u88ab\u7528\u6765\u5bf9\u4ed8\u4f2a\u9020\u7684\u8de8\u7f51\u7ad9\u8bf7\u6c42\u3002<\/p>\n<p>\u4f2a\u88c5\u6210\u6d4f\u89c8\u5668\u5c31\u662f\u5c06<code>User-Agent\u8bbe\u7f6e\u4e3a\u6d4f\u89c8\u5668\u7684\u5b57\u6bb5<\/code><\/p>\n<div class=\"image-package\">\n<p><img decoding=\"async\" src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/286cb-snip20141207_5.png\" alt=\"\u4f2a\u88c5\u5934\u90e8\" data-original-src=\"https:\/\/tao0.date\/wp-content\/uploads\/2018\/11\/286cb-snip20141207_5.png\" \/><\/p>\n<div class=\"image-caption\">\u4f2a\u88c5\u5934\u90e8<\/div>\n<\/div>\n<pre class=\"hljs python\"><code class=\"python\">user_agent = (<span class=\"hljs-string\">\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_10_1) \"<\/span>\r\n    <span class=\"hljs-string\">\"AppleWebKit\/537.36 (KHTML, like Gecko) \"<\/span>\r\n    <span class=\"hljs-string\">\"Chrome\/38.0.2125.111 Safari\/537.36\"<\/span>)\r\nrequest_header = {\r\n    <span class=\"hljs-string\">\"Referer\"<\/span>: <span class=\"hljs-string\">\"https:\/\/accounts.coursera.org\/signin\"<\/span>,  <span class=\"hljs-comment\">#\u5bf9\u4ed8\u9632\u76d7\u94fe\u8bbe\u7f6e, \u4e3a\u8df3\u8f6c\u6765\u6e90\u7684url<\/span>\r\n    <span class=\"hljs-string\">\"User-Agent\"<\/span>: user_agent, <span class=\"hljs-comment\">#\u4f2a\u88c5\u6210\u6d4f\u89c8\u5668\u8bbf\u95ee<\/span>\r\n}\r\n<\/code><\/pre>\n<h1>1. 
\u4f2a\u88c5\u5934\u90e8<\/h1>\n<p>\u4f7f\u7528<code>\u8c37\u6b4c\u6d4f\u89c8\u5668\u81ea\u5e26\u7684\u5f00\u53d1\u8005\u5de5\u5177<\/code>, \u9009\u62e9<code>Network<\/code>(<code>Element\u7528\u6765\u67e5\u770b\u7f51\u7ad9\u6e90\u7801\u7b49\u529f\u80fd<\/code>), \u83b7\u53d6\u8be6\u7ec6\u7684GET\u548cPOST\u65b9\u6cd5, \u4ece\u4e2d\u83b7\u53d6<code>\u767b\u5f55\u8bf7\u6c42<\/code>\u7684\u5934\u90e8\u4fe1\u606f,<br \/>\n\u4ecePOST\u4e2d\u83b7\u5f97Headers\u4fe1\u606f\u5982\u4e0b(<code>\u7701\u7565\u90e8\u5206\u4e0d\u91cd\u8981\u4fe1\u606f<\/code>)<\/p>\n<pre class=\"hljs java\"><code class=\"java\">Request URL:https:<span class=\"hljs-comment\">\/\/accounts.coursera.org\/api\/v1\/login  \/\/\u771f\u6b63\u7684\u767b\u9646\u9a8c\u8bc1\u9875\u9762<\/span>\r\nRequest Method:POST\r\nStatus Code:<span class=\"hljs-number\">401<\/span> Unauthorized\r\n\r\n<span class=\"hljs-comment\">\/\/Request Headers<\/span>\r\nConnection:keep-alive\r\nContent-Length:<span class=\"hljs-number\">55<\/span>\r\nContent-Type:application\/x-www-form-urlencoded\r\nCookie:(\u7701\u7565cookie\u4fe1\u606f, \u4e0b\u9762\u8be6\u7ec6\u4ecb\u7ecd)\r\n...\r\nReferer:https:<span class=\"hljs-comment\">\/\/accounts.coursera.org\/signin \/\/\u9632\u76d7\u94fe\u8bbe\u7f6e<\/span>\r\nUser-Agent:Mozilla\/<span class=\"hljs-number\">5.0<\/span> (Windows NT <span class=\"hljs-number\">6.1<\/span>; WOW64) AppleWebKit\/<span class=\"hljs-number\">537.36<\/span> (KHTML, like Gecko) Chrome\/<span class=\"hljs-number\">39.0<\/span>.2171.71 Safari\/<span class=\"hljs-number\">537.36<\/span>  <span class=\"hljs-comment\">\/\/\u6d4f\u89c8\u5668\u6d4f\u89c8\u6807\u8bc6<\/span>\r\n<span class=\"hljs-comment\">\/\/\u4e0b\u9762\u56db\u884c\u4e3a\u670d\u52a1\u5668\u6240\u505a\u7684\u9650\u5236\u5b57\u6bb5<\/span>\r\nX-CSRF2-Cookie:csrf2_token_el67QDLg\r\nX-CSRF2-Token:<span class=\"hljs-number\">1<\/span>oxZDVMuZGX0qCggdReQyj2R\r\nX-CSRFToken:WnVtiMDpvw0JXJqHjPrFk0EU\r\nX-Requested-With:XMLHttpRequest\r\n\r\n<span 
class=\"hljs-comment\">\/\/Form Data<\/span>\r\nemail:<span class=\"hljs-number\">1095<\/span>...<span class=\"hljs-meta\">@qq<\/span>.com <span class=\"hljs-comment\">\/\/Coursera\u8d26\u6237\u4fe1\u606f <\/span>\r\npassword:FAFA  <span class=\"hljs-comment\">\/\/\u8d26\u6237\u5bc6\u7801<\/span>\r\nwebrequest:<span class=\"hljs-keyword\">true<\/span> <span class=\"hljs-comment\">\/\/\u56fa\u5b9a\u5b57\u6bb5<\/span>\r\n<\/code><\/pre>\n<p>\u8fd9\u6837\u5c31\u80fd\u5199\u51fa\u6a21\u62df\u5934\u90e8\u7684\u51fd\u6570<\/p>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">structure_headers<\/span><span class=\"hljs-params\">(self)<\/span> :<\/span>\r\n        <span class=\"hljs-comment\">#\u6a21\u62df\u8868\u5355\u6570\u636e,\u8fd9\u4e2a\u53c2\u6570\u4e0d\u662f\u5b57\u5178<\/span>\r\n        form_data = urllib.urlencode({\r\n            <span class=\"hljs-string\">\"email\"<\/span>: self.user_name,\r\n            <span class=\"hljs-string\">\"password\"<\/span>: self.password,\r\n            <span class=\"hljs-string\">\"webrequest\"<\/span>: <span class=\"hljs-string\">\"true\"<\/span>\r\n        })  \r\n        user_agent = (<span class=\"hljs-string\">\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_10_1) \"<\/span>\r\n            <span class=\"hljs-string\">\"AppleWebKit\/537.36 (KHTML, like Gecko) \"<\/span>\r\n            <span class=\"hljs-string\">\"Chrome\/38.0.2125.111 Safari\/537.36\"<\/span>)\r\n        request_header = {\r\n            <span class=\"hljs-string\">\"Referer\"<\/span>: <span class=\"hljs-string\">\"https:\/\/accounts.coursera.org\/signin\"<\/span>,  <span class=\"hljs-comment\">#\u5bf9\u4ed8\u9632\u76d7\u94fe\u8bbe\u7f6e, \u4e3a\u8df3\u8f6c\u6765\u6e90\u7684url<\/span>\r\n            <span class=\"hljs-string\">\"User-Agent\"<\/span>: user_agent, <span class=\"hljs-comment\">#\u4f2a\u88c5\u6210\u6d4f\u89c8\u5668\u8bbf\u95ee<\/span>\r\n        
}\r\n        <span class=\"hljs-keyword\">return<\/span> form_data, request_header\r\n<\/code><\/pre>\n<p>\u8bd5\u4e86\u51e0\u6b21\u7adf\u7136\u90fd\u662f<code>400\u9519\u8bef<\/code>, \u4e5f\u5c31\u662f\u5934\u90e8\u8bf7\u6c42\u7684\u683c\u5f0f\u4e0d\u6b63\u786e, \u901a\u8fc7\u591a\u6b21Headers\u67e5\u770b, \u53d1\u73b0\u6709\u4e0b\u9762\u56db\u5904\u4e0d\u540c\u7684\u5934\u90e8<\/p>\n<pre class=\"hljs css\"><code class=\"css\"><span class=\"hljs-selector-tag\">X-CSRF2-Cookie<\/span><span class=\"hljs-selector-pseudo\">:csrf2_token_hTu4Zy8Y<\/span>  \u6700\u540e\u516b\u4f4d\u4e0d\u540c\r\n<span class=\"hljs-selector-tag\">X-CSRF2-Token<\/span><span class=\"hljs-selector-pseudo\">:O5OIRan9I99lTHmnYS27ocYb<\/span>  \u5b8c\u5168\u968f\u673a\r\n<span class=\"hljs-selector-tag\">X-CSRFToken<\/span><span class=\"hljs-selector-pseudo\">:HClYbs9HZoGweU54iR5r5z2y<\/span> \u5b8c\u5168\u968f\u673a\r\n<span class=\"hljs-selector-tag\">X-Requested-With<\/span><span class=\"hljs-selector-pseudo\">:XMLHttpRequest<\/span>  \u56fa\u5b9a\u4e0d\u53d8\r\n<\/code><\/pre>\n<blockquote><p>\u901a\u8fc7\u7f51\u4e0a\u641c\u7d22\u627e\u5230\u4e86\u89e3\u51b3\u65b9\u6848, coursera\u7684\u8bf7\u6c42\u5934\u90e8\u4e2d<code>X-CSRF2-Token\u548cX-CSRFToken<\/code>\u662f\u5b8c\u5168\u968f\u673a\u7684, <code>X-CSRF2-Cookie<\/code>\u540e\u516b\u4f4d\u662f\u968f\u673a\u751f\u6210\u7684, \u90fd\u662f\u7531\u5b57\u6bcd\u548c\u6570\u5b57\u968f\u673a\u751f\u6210\u7684.<\/p><\/blockquote>\n<p>\u4e8e\u662f\u4fee\u6539\u4ee3\u7801\u5982\u4e0b:<\/p>\n<pre class=\"hljs python\"><code class=\"python\">    <span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">structure_headers<\/span><span class=\"hljs-params\">(self)<\/span> :<\/span>\r\n        <span class=\"hljs-comment\">#\u6a21\u62df\u8868\u5355\u6570\u636e,\u8fd9\u4e2a\u53c2\u6570\u4e0d\u662f\u5b57\u5178<\/span>\r\n        form_data = urllib.urlencode({\r\n            <span 
class=\"hljs-string\">\"email\"<\/span>: self.user_name,\r\n            <span class=\"hljs-string\">\"password\"<\/span>: self.password,\r\n            <span class=\"hljs-string\">\"webrequest\"<\/span>: <span class=\"hljs-string\">\"true\"<\/span>\r\n        })  \r\n        user_agent = (<span class=\"hljs-string\">\"Mozilla\/5.0 (Macintosh; Intel Mac OS X 10_10_1) \"<\/span>\r\n            <span class=\"hljs-string\">\"AppleWebKit\/537.36 (KHTML, like Gecko) \"<\/span>\r\n            <span class=\"hljs-string\">\"Chrome\/38.0.2125.111 Safari\/537.36\"<\/span>)\r\n        XCSRF2Cookie = <span class=\"hljs-string\">'csrf2_token_%s'<\/span> % <span class=\"hljs-string\">''<\/span>.join(self.random_string(<span class=\"hljs-number\">8<\/span>))\r\n        XCSRF2Token = <span class=\"hljs-string\">''<\/span>.join(self.random_string(<span class=\"hljs-number\">24<\/span>))\r\n        XCSRFToken = <span class=\"hljs-string\">''<\/span>.join(self.random_string(<span class=\"hljs-number\">24<\/span>))\r\n        cookie = <span class=\"hljs-string\">\"csrftoken=%s; %s=%s\"<\/span> % (XCSRFToken, XCSRF2Cookie, XCSRF2Token)\r\n\r\n        request_header = {\r\n            <span class=\"hljs-string\">\"Referer\"<\/span>: <span class=\"hljs-string\">\"https:\/\/accounts.coursera.org\/signin\"<\/span>,  <span class=\"hljs-comment\">#\u5bf9\u4ed8\u9632\u76d7\u94fe\u8bbe\u7f6e, \u4e3a\u8df3\u8f6c\u6765\u6e90\u7684url<\/span>\r\n            <span class=\"hljs-string\">\"User-Agent\"<\/span>: user_agent, <span class=\"hljs-comment\">#\u4f2a\u88c5\u6210\u6d4f\u89c8\u5668\u8bbf\u95ee<\/span>\r\n            <span class=\"hljs-string\">\"X-Requested-With\"<\/span>: <span class=\"hljs-string\">\"XMLHttpRequest\"<\/span>,\r\n            <span class=\"hljs-string\">\"X-CSRF2-Cookie\"<\/span>: XCSRF2Cookie,\r\n            <span class=\"hljs-string\">\"X-CSRF2-Token\"<\/span>: XCSRF2Token,\r\n            <span class=\"hljs-string\">\"X-CSRFToken\"<\/span>: XCSRFToken,\r\n            <span 
class=\"hljs-string\">\"Cookie\"<\/span>: cookie\r\n        }\r\n        <span class=\"hljs-keyword\">return<\/span> form_data, request_header\r\n\r\n    <span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">random_string<\/span><span class=\"hljs-params\">(self, length)<\/span>:<\/span>\r\n        <span class=\"hljs-keyword\">return<\/span> <span class=\"hljs-string\">''<\/span>.join(random.choice(string.letters + string.digits) <span class=\"hljs-keyword\">for<\/span> i <span class=\"hljs-keyword\">in<\/span> xrange(length))\r\n<\/code><\/pre>\n<h1>2. \u6a21\u62df\u767b\u9646<\/h1>\n<p>\u767b\u9646Coursera\u7684\u4e0b\u8f7d\u9875\u9762\u65f6<a href=\"https:\/\/link.jianshu.com?t=https:\/\/class.coursera.org\/pkuco-001\/lecture\" target=\"_blank\" rel=\"nofollow noopener\">\u8ba1\u7b97\u673a\u7ec4\u6210\u89c6\u9891\u4e0b\u8f7d<\/a>, \u4f1a\u53d1\u73b0\u662f\u8981\u6c42\u767b\u9646\u5462, \u8fd9\u65f6\u5019\u5c31\u4f7f\u7528<code>cookielib<\/code>\u6a21\u5757\u8fdb\u884ccookie\u7684\u5904\u7406<\/p>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">simulation_login<\/span><span class=\"hljs-params\">(self)<\/span> :<\/span>\r\n    cookie = cookielib.CookieJar()\r\n    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))\r\n    urllib2.install_opener(opener)\r\n    form_data, request_header = self.structure_headers()\r\n    req = urllib2.Request(self.login_url, data = form_data, headers = request_header)\r\n    <span class=\"hljs-keyword\">try<\/span> :\r\n        result = urllib2.urlopen(req)\r\n    <span class=\"hljs-keyword\">except<\/span> urllib2.URLError,e :\r\n        <span class=\"hljs-keyword\">if<\/span> hasattr(e, <span class=\"hljs-string\">\"code\"<\/span>):\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"The server couldn't fulfill the request.Please check 
your url and read the Reason\"<\/span>\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Error code: %s\"<\/span> % e.code\r\n        <span class=\"hljs-keyword\">elif<\/span> hasattr(e, <span class=\"hljs-string\">\"reason\"<\/span>):\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"We failed to reach a server. Please check your url and read the Reason\"<\/span>\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Reason: %s\"<\/span> % e.reason\r\n        sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    <span class=\"hljs-keyword\">if<\/span> result.getcode() == <span class=\"hljs-number\">200<\/span> :\r\n        <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u767b\u9646\u6210\u529f...\"<\/span>\r\n\r\n<\/code><\/pre>\n<p>\u8fd9\u4e2a\u51fd\u6570\u7528\u4e8e\u6a21\u62df\u767b\u9646, \u5e76\u663e\u793a\u767b\u9646\u6210\u529f\u6216\u8005\u5931\u8d25<\/p>\n<h1>3. 
\u6293\u53d6\u4e0b\u8f7d\u94fe\u63a5<\/h1>\n<p>\u6293\u53d6\u94fe\u63a5\u901a\u8fc7\u6b63\u5219\u8868\u8fbe\u5f0f, \u4e3b\u8981\u5339\u914d<code>PDF\u4e0b\u8f7d\u94fe\u63a5\u548cMP4\u89c6\u9891\u4e0b\u8f7d\u94fe\u63a5<\/code><\/p>\n<p>\u4f7f\u7528<code>re.findall()<\/code>\u51fd\u6570\u8fdb\u884c\u51fd\u6570\u5339\u914d<\/p>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">get_links<\/span><span class=\"hljs-params\">(self)<\/span> :<\/span>\r\n    <span class=\"hljs-keyword\">try<\/span> :\r\n        result = urllib2.urlopen(self.url)\r\n    <span class=\"hljs-keyword\">except<\/span> urllib2.URLError,e :\r\n        <span class=\"hljs-keyword\">if<\/span> hasattr(e, <span class=\"hljs-string\">\"code\"<\/span>):\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"The server couldn't fulfill the request.\"<\/span>\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Error code: %s\"<\/span> % e.code\r\n        <span class=\"hljs-keyword\">elif<\/span> hasattr(e, <span class=\"hljs-string\">\"reason\"<\/span>):\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"We failed to reach a server. 
Please check your url and read the Reason\"<\/span>\r\n            <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Reason: %s\"<\/span> % e.reason\r\n        sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    content = result.read().decode(<span class=\"hljs-string\">\"utf-8\"<\/span>)\r\n    <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u8bfb\u53d6\u7f51\u9875\u6210\u529f...\"<\/span>\r\n    down_links = re.findall(<span class=\"hljs-string\">r'&lt;a.*?href=\"(.*?mp4.*?)\"'<\/span>, content)\r\n    down_pdfs = re.findall(<span class=\"hljs-string\">r'&lt;a.*?href=\"(.*?pdf)\"'<\/span>, content)\r\n    <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u6b63\u5219\u5339\u914d\u7ed3\u675f...\"<\/span>\r\n    <span class=\"hljs-keyword\">return<\/span> down_links, down_pdfs\r\n\r\n<\/code><\/pre>\n<h1>4. \u5199\u5165\u5b8c\u672c<\/h1>\n<p>\u65e2\u7136\u5df2\u7ecf\u5339\u914d\u4e86\u6240\u6709\u7684\u94fe\u63a5, \u8fd9\u4e00\u6b65\u5c31\u662f\u5c06\u94fe\u63a5\u5199\u5165\u5230\u6587\u4ef6\u4e2d<\/p>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">start_spider<\/span><span class=\"hljs-params\">(self)<\/span> :<\/span>\r\n    self.simulation_login()\r\n    down_links, down_pdfs = self.get_links()\r\n    <span class=\"hljs-keyword\">with<\/span> open(<span class=\"hljs-string\">\"coursera.html\"<\/span>, <span class=\"hljs-string\">\"w+\"<\/span>) <span class=\"hljs-keyword\">as<\/span> my_file :\r\n        <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u4e0b\u8f7d\u94fe\u63a5\u7684\u957f\u5ea6\"<\/span>, len(down_links)\r\n        <span class=\"hljs-keyword\">for<\/span> link <span class=\"hljs-keyword\">in<\/span> down_links :\r\n            <span class=\"hljs-keyword\">print<\/span> link\r\n            <span class=\"hljs-keyword\">try<\/span> :\r\n               
 my_file.write(link + <span class=\"hljs-string\">\"\\n\"<\/span>)\r\n            <span class=\"hljs-keyword\">except<\/span> UnicodeEncodeError:\r\n                sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    <span class=\"hljs-keyword\">with<\/span> open(<span class=\"hljs-string\">\"coursera.pdf\"<\/span>, <span class=\"hljs-string\">\"w+\"<\/span>) <span class=\"hljs-keyword\">as<\/span> my_file :\r\n        <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u4e0b\u8f7dpdf\u7684\u957f\u5ea6\"<\/span>, len(down_pdfs)\r\n        <span class=\"hljs-keyword\">for<\/span> pdf <span class=\"hljs-keyword\">in<\/span> down_pdfs :\r\n            <span class=\"hljs-keyword\">try<\/span> :\r\n                my_file.write(pdf + <span class=\"hljs-string\">\"\\n\"<\/span>)\r\n            <span class=\"hljs-keyword\">except<\/span> UnicodeEncodeError :\r\n                sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u6293\u53d6Coursera\u8bfe\u7a0b\u4e0b\u8f7d\u94fe\u63a5\u548cpdf\u94fe\u63a5\u6210\u529f\"<\/span>\r\n<\/code><\/pre>\n<h1>5. 
\u767b\u9646\u8c03\u7528<\/h1>\n<p>\u4f7f\u7528<code>getpass\u6a21\u5757\u4e2d\u7684getpass()<\/code>\u8f93\u5165\u5bc6\u7801, \u4f7f\u7528\u8fd9\u4e2a\u51fd\u6570\u8f93\u5165\u5bc6\u7801\u7684\u65f6\u5019\u4e0d\u4f1a\u663e\u793a\u4efb\u4f55\u5b57\u7b26, \u8c8c\u4f3c\u4f53\u9a8c\u4e0d\u597d, \u4e0b\u6b21\u4fee\u6539\u4e00\u4e0b, \u7136\u540e\u547d\u4ee4\u884c\u4f20\u5165\u4e0b\u8f7d\u8bfe\u7a0b\u7684\u53c2\u6570.<\/p>\n<p>\u901a\u8fc7\u6bd4\u8f83\u6bcf\u95e8\u8bfe\u4e0b\u8f7d\u9875\u9762, \u53d1\u73b0\u4e4b\u540e<code>\/lecture<\/code>\u524d\u7684\u8fd9\u4e2a\u5b57\u6bb5\u662f\u4e0d\u540c\u7684,\u8fd9\u6837\u53ef\u4ee5\u8bbe\u7f6e\u901a\u8fc7\u547d\u4ee4\u884c\u4f20\u5165\u8fd9\u4e2a\u53c2\u6570<br \/>\n<a href=\"https:\/\/link.jianshu.com?t=https:\/\/class.coursera.org\/pkuco-001\/lecture\" target=\"_blank\" rel=\"nofollow noopener\">https:\/\/class.coursera.org\/pkuco-001\/lecture<\/a><br \/>\n<a href=\"https:\/\/link.jianshu.com?t=https:\/\/class.coursera.org\/electromagnetism-001\/lecture\" target=\"_blank\" rel=\"nofollow noopener\">https:\/\/class.coursera.org\/electromagnetism-001\/lecture<\/a><\/p>\n<blockquote><p>\u8fd9\u91cc\u5728\u8fd0\u884c\u547d\u4ee4\u884c\u53ea\u9700\u8981\u4f20\u5165<code>pkuco-001\u6216\u8005electromagnetism-001<\/code>\u8fd9\u79cd\u5b57\u6bb5<\/p><\/blockquote>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">main<\/span><span class=\"hljs-params\">()<\/span> :<\/span>\r\n    <span class=\"hljs-keyword\">if<\/span> len(sys.argv) != <span class=\"hljs-number\">2<\/span> :\r\n        <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Please Input what course you want to download..\"<\/span>\r\n        sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    url = <span class=\"hljs-string\">\"https:\/\/class.coursera.org\/{course}\/lecture\"<\/span>\r\n\r\n    user_name = raw_input(<span 
class=\"hljs-string\">\"Input your Email &gt; \"<\/span>)\r\n    password = getpass.getpass(<span class=\"hljs-string\">\"Input your Password &gt; \"<\/span>)\r\n    spider = Coursera(url.format(course = sys.argv[<span class=\"hljs-number\">1<\/span>]), user_name, password)\r\n    spider.start_spider()\r\n<\/code><\/pre>\n<h1>6. \u4e0b\u8f7d\u811a\u672c<\/h1>\n<p>\u4e0b\u8f7d\u53ef\u4ee5\u4f7f\u7528<code>curl<\/code>, Mac\u4e0b\u5b89\u88c5\u65b9\u6cd5<\/p>\n<pre class=\"hljs undefined\"><code>brew install curl\r\n<\/code><\/pre>\n<p>\u7f16\u5199\u4e0b\u8f7d\u4f7f\u7528\u7684Python\u811a\u672c, \u7f16\u5199\u6210\u529f\u540e, \u4fee\u6539\u811a\u672c\u6587\u4ef6\u7684\u6587\u4ef6\u6743\u9650<\/p>\n<pre class=\"hljs bash\"><code class=\"bash\">chmod 755  downloadshell.py <span class=\"hljs-comment\">#\u6700\u540e\u4e00\u4e2a\u53c2\u6570\u4e3aPython\u811a\u672c\u540d\u79f0<\/span>\r\n<\/code><\/pre>\n<p>\u8fd0\u884c\u4e0b\u8f7d\u811a\u672c<\/p>\n<pre class=\"hljs bash\"><code class=\"bash\"><span class=\"hljs-variable\">$python<\/span> downloadshell.py coursera.pdf <span class=\"hljs-comment\">#\u6700\u540e\u4e00\u4e2a\u53c2\u6570\u4e3a\u8fde\u63a5\u4fdd\u5b58\u6587\u4ef6<\/span>\r\n<\/code><\/pre>\n<p>\u4e0b\u9762\u662f\u6211\u81ea\u5df1\u7f16\u5199\u7684\u7b80\u5355\u7684\u4e0b\u8f7d\u811a\u672c<\/p>\n<pre class=\"hljs python\"><code class=\"python\"><span class=\"hljs-comment\">#!\/usr\/bin\/python2<\/span>\r\n<span class=\"hljs-comment\"># -*- coding:utf-8 -*-<\/span>\r\n<span class=\"hljs-comment\">#\u7b80\u5355\u7684\u6587\u4ef6\u4e0b\u8f7d\u811a\u672c<\/span>\r\n\r\n<span class=\"hljs-keyword\">import<\/span> os\r\n<span class=\"hljs-keyword\">import<\/span> sys, re\r\n\r\n<span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">read_file<\/span><span class=\"hljs-params\">(file_name)<\/span> :<\/span>\r\n    down_links = []\r\n    <span class=\"hljs-keyword\">with<\/span> open(file_name, <span 
class=\"hljs-string\">\"r\"<\/span>) <span class=\"hljs-keyword\">as<\/span> my_file :\r\n        <span class=\"hljs-keyword\">for<\/span> url <span class=\"hljs-keyword\">in<\/span> my_file :\r\n            down_links.append(url.replace(<span class=\"hljs-string\">\"\\n\"<\/span>, <span class=\"hljs-string\">\"\"<\/span>))\r\n    <span class=\"hljs-keyword\">return<\/span> down_links\r\n\r\n<span class=\"hljs-function\"><span class=\"hljs-keyword\">def<\/span> <span class=\"hljs-title\">main<\/span><span class=\"hljs-params\">()<\/span> :<\/span>\r\n    <span class=\"hljs-keyword\">if<\/span> len(sys.argv) != <span class=\"hljs-number\">2<\/span> :\r\n        <span class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"Please input file name...\"<\/span>\r\n        sys.exit(<span class=\"hljs-number\">2<\/span>)\r\n    down_links = read_file(sys.argv[<span class=\"hljs-number\">1<\/span>])\r\n    pdf_index = <span class=\"hljs-number\">1<\/span>\r\n    mp4_index = <span class=\"hljs-number\">1<\/span>\r\n    <span class=\"hljs-keyword\">for<\/span> index, link <span class=\"hljs-keyword\">in<\/span> enumerate(down_links) :\r\n        <span class=\"hljs-keyword\">if<\/span> link.find(<span class=\"hljs-string\">\"mp4\"<\/span>) != <span class=\"hljs-number\">-1<\/span> :\r\n            os.system(<span class=\"hljs-string\">\"curl \"<\/span> + link + <span class=\"hljs-string\">\" -o \"<\/span> + <span class=\"hljs-string\">\"%d.mp4\"<\/span> % mp4_index)\r\n        <span class=\"hljs-keyword\">elif<\/span> link.find(<span class=\"hljs-string\">\"pdf\"<\/span>) != <span class=\"hljs-number\">-1<\/span> :\r\n            os.system(<span class=\"hljs-string\">\"curl \"<\/span> + link + <span class=\"hljs-string\">\" -o \"<\/span> + <span class=\"hljs-string\">\"%d.pdf\"<\/span> % pdf_index)\r\n            pdf_index += <span class=\"hljs-number\">1<\/span>\r\n        <span class=\"hljs-keyword\">else<\/span> :\r\n            <span 
class=\"hljs-keyword\">print<\/span> <span class=\"hljs-string\">\"\u8bf7\u81ea\u884c\u8865\u5168\u4e0b\u8f7d\u547d\u4ee4...\"<\/span>\r\n\r\n<span class=\"hljs-keyword\">if<\/span> __name__ == <span class=\"hljs-string\">'__main__'<\/span>:\r\n    main()\r\n<\/code><\/pre>\n<h1>7.\u5b8c\u6574\u4ee3\u7801\u67e5\u770b<\/h1>\n<p><a href=\"https:\/\/link.jianshu.com?t=https:\/\/github.com\/Andrew-liu\/coursera_spider\" target=\"_blank\" rel=\"nofollow noopener\">Github\u5b8c\u6574\u4ee3\u7801<\/a><\/p>\n<\/div>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>0.1. cookie\u5904\u7406 \u9700\u8981\u8fdb\u884c\u767b\u9646\u7684\u65f6\u5019, &hellip;<\/p>\n","protected":false},"author":1,"featured_media":3184,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[29],"tags":[],"class_list":["post-3182","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-website-document"],"_links":{"self":[{"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/posts\/3182","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/tao0.date\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=3182"}],"version-history":[{"count":0,"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/posts\/3182\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/tao0.date\/index.php?rest_route=\/wp\/v2\/media\/3184"}],"wp:attachment":[{"href":"https:\/\/tao0.date\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=3182"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/tao0.date\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=3182"},{"taxon
omy":"post_tag","embeddable":true,"href":"https:\/\/tao0.date\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=3182"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}