Bir Python dizesine iki nokta üst üste (:) karakterini nasıl eklerim? "inurl: bir şey" ifadesini içeren bir dizgeyi bir değişkene aktarmam gerekiyor; böylece URL'de bu terimi temel alan gelişmiş bir Google araması yapabilirim.
Örneğin, benim program otomatik Google arama programına bu değişkeni geçirir:
search_terms = 'python developer inurl:jobs'
Ama bu işe yaramıyor ve sorunu iki nokta üst üste karakterine kadar daralttım. "inurl: her neyse" ifadesini, Python onu bir programlama simgesi olarak yorumlamadan, dizgedeki diğer her şey gibi düz bir metin parçası olarak nasıl ekleyebilirim? Köşeli parantez, normal parantez ve Stack'teki sorularda önerilenler dahil aklıma gelen her şeyi denedim. Teşekkürler!
Daha fazla kod, istendiği gibi: Belki de hatayı dosyaya yazarken alıyorum? Program, e-postaları arayan bir işlev çalıştırıyor ve sonra
import requests,re,bs4
from selenium import webdriver
from bs4 import BeautifulSoup
import time,random
# The colon in 'inurl:jobs' is plain text inside a string literal --
# Python attaches no special meaning to ':' in a string, so no escaping
# is needed.  These two strings are concatenated into the Google query.
search_terms = 'python developer inurl:jobs'
added_terms = 'contact email'
... sonra sonuçları bir Not Defteri dosyasına yazması gerekiyordu (bu kısmın çalıştığını biliyorum):
number_of_sites = 1  # how many search results (sites) to parse for emails
number_of_search_pages = 1  # how many extra Google result pages to walk
def google_this_for_emails():
    """Google the search terms and scrape e-mail addresses from the results.

    Resets and fills the module-level lists ``scrapedEmails`` (addresses
    found) and ``emails_not_found`` (result pages with no address), and
    publishes the cleaned result URLs in ``dicti_pretty_links``.
    Performs network I/O (requests + Selenium/Firefox); returns None.
    """
    global scrapedEmails
    scrapedEmails = []
    global emails_not_found
    emails_not_found = []
    # BUG FIX: iterating a plain string walks it CHARACTER by character
    # (so the query became 'p', 'y', 't', ...).  Wrap a lone string so
    # each iteration sees a whole search term.
    terms = [search_terms] if isinstance(search_terms, str) else search_terms
    # BUG FIX: the regex had been garbled into "[email protected]" (Stack
    # Overflow's e-mail obfuscation); restored to a plain user@host matcher.
    email_re = re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9_.+-]+)')
    headers = {'User-agent': 'Mozilla/5.0'}
    for term in terms:
        # NOTE(review): term and added_terms are joined with no separator,
        # yielding '...inurl:jobscontact email' -- confirm this is intended.
        webpage = 'http://google.com/search?q=' + str(term) + str(added_terms)
        print('\n Searching for the terms...', term, added_terms)
        res = requests.get(webpage, headers=headers)
        if res.status_code == 200:
            soup = bs4.BeautifulSoup(res.text, 'lxml')
            # Raw hrefs of the organic results, skipping PDFs.
            raw_links = [a.get('href') for a in soup.select('.r a')
                         if 'pdf' not in a.get('href')]
            # BUG FIX: str.strip('/url?q=') strips any of those CHARACTERS
            # from both ends and mangles URLs; cut the literal prefix instead.
            cleaned = [el.partition('/url?q=')[2] for el in raw_links
                       if '/url?q=' in el]
            global dicti_pretty_links
            # Keep only the first number_of_sites links, dropping the
            # '&...' tracking junk Google appends.
            dicti_pretty_links = [el.partition('&')[0]
                                  for el in cleaned[0:number_of_sites]]
            print(dicti_pretty_links)
            # --- Pass 1: plain requests + regex over the raw HTML ---------
            for site in dicti_pretty_links:
                res = requests.get(site, headers=headers)
                if res.status_code == 200:
                    mo = email_re.findall(res.text)
                    print('THIS BELOW IS MO')
                    print(mo, 'EMAILS COMING FROM: ', site)
                    for email in mo:
                        if email not in scrapedEmails:
                            scrapedEmails.append(email)
            # --- Pass 2: Selenium, following a "Contact" link if any ------
            for site in dicti_pretty_links:
                browser = webdriver.Firefox()
                try:
                    browser.get(site)
                    time.sleep(random.uniform(0.5, 1.5))
                    try:
                        contact_link = browser.find_element_by_partial_link_text('ontact')
                        if contact_link:
                            contact_link.click()
                    except Exception:
                        pass  # no contact link -- scan the landing page instead
                    html = browser.page_source
                    time.sleep(random.uniform(0.5, 1.5))
                    mo = email_re.findall(html)
                    print('THIS BELOW IS SEL_emails_MO for', site)
                    print(mo, 'EMAILS COMING FROM: ', site)
                    if not mo:
                        print('no emails found in ', site)
                        emails_not_found.append(site)
                    for email in mo:
                        if email not in scrapedEmails:
                            scrapedEmails.append(email)
                finally:
                    browser.close()  # BUG FIX: close the window even on error
            print('EMAILS SCRAPED SO FAR: \n', scrapedEmails)
            time.sleep(random.uniform(0.5, 1.5))  # human-like random delay
def google_nextpage_for_emails():
    """Scrape e-mails from subsequent Google result pages (``&start=N``).

    Walks ``number_of_search_pages`` further pages (10 results apart),
    appending to the module-level ``scrapedEmails`` / ``emails_not_found``
    lists filled by google_this_for_emails(), then calls report().
    Performs network I/O (requests + Selenium/Firefox); returns None.
    """
    print(60 * '-')
    print('STARTING FUNCTION NEXTPAGE FOR EMAILS')
    counter = 10  # Google paginates 10 results per page
    # BUG FIX: iterating a plain string walks it CHARACTER by character;
    # wrap a lone string so each iteration sees a whole search term.
    terms = [search_terms] if isinstance(search_terms, str) else search_terms
    # BUG FIX: the regex had been garbled into "[email protected]";
    # restored to a plain user@host matcher.
    email_re = re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9_.+-]+)')
    headers = {'User-agent': 'Mozilla/5.0'}
    for _page in range(0, number_of_search_pages):
        for term in terms:
            webpage = ('https://www.google.com/search?q=' + str(term)
                       + str(added_terms) + '&start=' + str(counter))
            print('\n Searching for the terms...', term, added_terms, 'on', webpage)
            res = requests.get(webpage, headers=headers)
            if res.status_code == 200:
                soup = bs4.BeautifulSoup(res.text, 'lxml')
                raw_links = [a.get('href') for a in soup.select('.r a')
                             if 'pdf' not in a.get('href')]
                # BUG FIX: str.strip('/url?q=') strips CHARACTERS, not the
                # prefix; cut the literal prefix instead.
                cleaned = [el.partition('/url?q=')[2] for el in raw_links
                           if '/url?q=' in el]
                global dicti_pretty_links
                dicti_pretty_links = [el.partition('&')[0]
                                      for el in cleaned[0:number_of_sites]]
                print(dicti_pretty_links)
                # --- Pass 1: plain requests + regex over the raw HTML -----
                for site in dicti_pretty_links:
                    res = requests.get(site, headers=headers)
                    if res.status_code == 200:
                        mo = email_re.findall(res.text)
                        print('THIS BELOW IS MO')
                        print(mo, site, 'THIS IS THE WEBSITE IT IS COMING FROM')
                        for email in mo:
                            if email not in scrapedEmails:
                                scrapedEmails.append(email)
                # --- Pass 2: Selenium, following a "Contact" link if any --
                try:
                    for site in dicti_pretty_links:
                        browser = webdriver.Firefox()
                        try:
                            browser.get(site)
                            time.sleep(random.uniform(1, 2))
                            try:
                                contact_link = browser.find_element_by_partial_link_text('ontact')
                                if contact_link:
                                    contact_link.click()
                            except Exception as e:
                                # BUG FIX: the old 'continue' here skipped the
                                # page AND leaked the open Firefox window; now
                                # the landing page is scanned anyway.
                                print(e)
                            html = browser.page_source
                            time.sleep(random.uniform(1, 2))
                            mo = email_re.findall(html)
                            print('THIS BELOW IS SEL_emails_MO for', site)
                            print(mo, site, 'THIS IS THE WEBSITE IT IS COMING FROM')
                            if not mo:
                                print('no emails found in ', site)
                                emails_not_found.append(site)
                            for email in mo:
                                if email not in scrapedEmails:
                                    scrapedEmails.append(email)
                        finally:
                            browser.close()  # close the window even on error
                except Exception as e:
                    print(e)
                    continue
        counter += 10  # advance to the next result page
        time.sleep(random.uniform(1, 2.5))  # human-like random delay
        print('EMAILS SCRAPED SO FAR \n', scrapedEmails)
    report()
def open_emails_lost():
    """Open every page where no e-mail was found, one Firefox window each.

    The windows are deliberately left open so the user can inspect the
    pages by hand.  Pages that fail to load are skipped.  Returns None.
    """
    for url in emails_not_found:
        print(url)
        browser = webdriver.Firefox()
        try:
            browser.get(url)
            time.sleep(random.uniform(1, 2))
        except Exception:
            # BUG FIX: was a bare 'except:' that also swallowed
            # KeyboardInterrupt/SystemExit; best effort, skip the page.
            pass
def report():
    """Write the scraped e-mails and the miss list to a text file.

    The file is named '<search_terms>_<added_terms>.txt' in the current
    directory.  Reads the module globals search_terms, added_terms,
    scrapedEmails and emails_not_found.  Returns None.
    """
    filename = str(search_terms) + str('_') + str(added_terms)
    # NOTE(review): the ':' in 'inurl:jobs' makes this an illegal file name
    # on Windows -- likely the real cause of the asker's failure; confirm
    # the target OS and sanitize if so.
    # BUG FIX: use a context manager so the file is closed even on error.
    with open(filename + '.txt', 'w') as testFile:
        testFile.write('SEARCH: ')
        testFile.write(str(search_terms).upper())
        testFile.write(str(added_terms).upper())
        testFile.write('\n')
        # NOTE(review): len() of a *string* counts characters, not parsed
        # results -- probably meant len(dicti_pretty_links); confirm.
        testFile.write(str(len(search_terms)))
        testFile.write(' Google result parsed')
        testFile.write('\n')
        testFile.write(str(len(scrapedEmails)))
        testFile.write(' emails found')
        testFile.write('\n')
        testFile.write(60 * '*')
        testFile.write('\n')
        testFile.write(str(scrapedEmails)[1:-1])  # [1:-1] drops the brackets
        testFile.write('\n')
        testFile.write('\n')
        testFile.write(str('And these below are the pages were emails were not found_____________'))
        testFile.write('\n')
        testFile.write(str(emails_not_found)[1:-1])
    print('The information has been successfully written to', filename)
    print(60 * '-')
# Run the full pipeline, then offer to open the pages that had no e-mails.
google_this_for_emails()
google_nextpage_for_emails()
report()
# BUG FIX: the answer variable was named 'open', shadowing the builtin
# open() that report() depends on if it ever runs again afterwards.
answer = input('Press any key to open the webpages that did not contain email addresses, or type "quit" to end program')
if answer == 'quit':
    pass
else:
    open_emails_lost()
Anlamsız. Kesinlikle bir Python dizesinde herhangi bir yere bir kolon koyabilirsiniz. Kodunuzu gösterin, böylece neyin yanlış olduğunu görebiliriz. –
Size daha fazla bilgi ekledim. Mümkün olduğunca kısa tutmaya çalışıyordum çünkü Stack bana sık sık mümkün olan minimum kodu kullanmam gerektiğini söylüyor. – skeitel
@skeitel, neyin yanlış olduğunu anlayabilmemiz için her zaman bir [MCVE] (http://stackoverflow.com/help/mcve) eklemelisiniz. Artık kodunuzu eklediniz, aldığınız tam hata mesajını da ekleyebilir misiniz? Geri tepme, soruna neden olan çizgiyi göstermelidir. – wnnmaw