diff --git a/scrap_on_google.ipynb b/scrap_on_google.ipynb index 342400196ec6d2165701c42ff219244e70bb30af..6a10937062e02a859f17a2cbcad7c46557ff0c04 100644 --- a/scrap_on_google.ipynb +++ b/scrap_on_google.ipynb @@ -2,16 +2,16 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "The time recorded in the file is : 1617806155.160355\n", - "The time now is : 1617806461.4603753\n", - "A total difference of 5.105000336964925 minutes\n", + "The time recorded in the file is : 1621981702.890581\n", + "The time now is : 1621982397.891385\n", + "A total difference of 11.583346736431121 minutes\n", "It has been less than 15 minutes since the proxies were renewed, therefore sticking with the old proxies\n" ] } @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -60,7 +60,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -69,7 +69,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [], "source": [ @@ -124,7 +124,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -150,7 +150,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -180,7 +180,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -192,7 +192,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -218,7 +218,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -227,7 +227,7 @@ "\"def get_pdf_info(f):\\n #pp = pprint.PrettyPrinter(indent=4)\\n fd = PyPDF2.PdfFileReader(f, 'rb')\\n doc_info = fd.getDocumentInfo()\\n #pp.pprint(doc_info)\\n return format_to_iso_date(doc_info['/ModDate'])\"" ] }, - "execution_count": 9, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -243,7 +243,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -263,7 +263,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -282,7 +282,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 14, "metadata": {}, "outputs": [], "source": [ @@ -297,7 +297,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ @@ -320,7 +320,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -561,7 +561,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -632,7 +632,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 18, "metadata": {}, "outputs": [], "source": [ @@ -730,7 +730,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 19, "metadata": {}, "outputs": [], "source": [ @@ -828,7 +828,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -839,8 +839,8 @@ " links = soup.findAll(\"a\")\n", " url_list = []\n", " for link in links :\n", - " print(link)\n", - " print(\"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ\")\n", + " #print(link)\n", + " #print(\"QQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ\")\n", " link_href = link.get('href')\n", " if \"url?q=\" in link_href:# and not \"webcache\" in link_href:\n", " # print (link.get('href').split(\"?q=\")[1].split(\"&sa=U\")[0])\n", @@ -850,7 +850,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 21, "metadata": {}, "outputs": [], "source": [ @@ -883,8 +883,8 @@ "\n", " for requete in liste_requetes:\n", " #print('###', requete)\n", - " #response = search(requete, lang='fr', stop=10) # stop=10\n", - " response = run_requete(requete) # top@10 first pages\n", + " response = search(requete, lang='fr', stop=10) # stop=10\n", + " #response = run_requete(requete) # top@10 first pages\n", " # response = bing_search.search(requete) # stop=10\n", " try:\n", " #liste_url = list(response)\n", @@ -935,7 +935,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 22, "metadata": {}, "outputs": [], "source": [ @@ -972,8 +972,8 @@ "\n", " for requete in liste_requetes:\n", " #print('###', requete)\n", - " #response = search(requete, lang='fr', stop=10) # stop=10\n", - " response = run_requete(requete) # top@10 first pages\n", + " response = search(requete, lang='fr', stop=10) # stop=10\n", + " #response = run_requete(requete) # top@10 first pages\n", " # response = bing_search.search(requete) # stop=10\n", " try:\n", " #liste_url = list(response)\n",