From 4ca67b74bce538aa2f59e4aebc94db265fb1b1f8 Mon Sep 17 00:00:00 2001 From: Scott Wallace Date: Sat, 7 Dec 2019 07:53:31 +0000 Subject: [PATCH] Fix Google search engine for Searx --- .../searx_server/files/engines/google.py | 68 +++---------------- 1 file changed, 8 insertions(+), 60 deletions(-) diff --git a/.ansible/roles/searx_server/files/engines/google.py b/.ansible/roles/searx_server/files/engines/google.py index 98b49f7..0575efe 100644 --- a/.ansible/roles/searx_server/files/engines/google.py +++ b/.ansible/roles/searx_server/files/engines/google.py @@ -107,13 +107,12 @@ images_path = '/images' supported_languages_url = 'https://www.google.com/preferences?#languages' # specific xpath variables -results_xpath = '//div[@class="g"]' -url_xpath = './/h3/a/@href' -title_xpath = './/h3' -content_xpath = './/span[@class="st"]' -content_misc_xpath = './/div[@class="f slp"]' -suggestion_xpath = '//p[@class="_Bmc"]' -spelling_suggestion_xpath = '//a[@class="spell"]' +results_xpath = '//div[contains(@class, "ZINbbc")]' +url_xpath = './/div[@class="kCrYT"][1]/a/@href' +title_xpath = './/div[@class="kCrYT"][1]/a/div[1]' +content_xpath = './/div[@class="kCrYT"][2]//div[contains(@class, "BNeawe")]//div[contains(@class, "BNeawe")]' +suggestion_xpath = '//div[contains(@class, "ZINbbc")][last()]//div[@class="rVLSBd"]/a//div[contains(@class, "BNeawe")]' +spelling_suggestion_xpath = '//div[@id="scc"]//a' # map : detail location map_address_xpath = './/div[@class="s"]//table//td[2]/span/text()' @@ -169,7 +168,7 @@ def request(query, params): if params['language'] == 'all' or params['language'] == 'en-US': language = 'en-GB' else: - language = match_language(params['language'], supported_languages, {}) + language = match_language(params['language'], supported_languages, language_aliases) language_array = language.split('-') if params['language'].find('-') > 0: @@ -199,9 +198,6 @@ def request(query, params): params['headers']['Accept-Language'] = language + ',' + language + '-' + country params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8' - # Force Internet Explorer 12 user agent to avoid loading the new UI that Searx can't parse - params['headers']['User-Agent'] = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Version/3.0 Safari/525.13" - params['google_hostname'] = google_hostname return params @@ -273,9 +269,7 @@ def response(resp): content = extract_text_from_dom(result, content_xpath) if content is None: continue - content_misc = extract_text_from_dom(result, content_misc_xpath) - if content_misc is not None: - content = content_misc + "
" + content + # append result results.append({'url': url, 'title': title, @@ -285,52 +279,6 @@ def response(resp): logger.debug('result parse error in:\n%s', etree.tostring(result, pretty_print=True)) continue - logger.debug(results) - if not results: - logger.debug('SCOTT trying new interface') - a_tags = dom.xpath('//a') - for a_tag in a_tags: - href = a_tag.get('href', '') - url = dict(parse_qsl(href)).get('/url?q', None) - if not (url and url.startswith('http')): - continue - ancestors = [x for x in a_tag.iterancestors()] - try: - title_divs = a_tag.xpath('./div') - if title_divs: - title = title_divs[0].text.strip().replace('\n', ' ').replace(' ', ' ') - else: - title = None - item_div = ancestors[2] - content_divs = item_div.xpath('./div/div[3]/div/div/div/div/div[1]/div') - if content_divs: - content = content_divs[0].text.strip() - else: - content_divs = item_div.xpath('./div/div[3]/div/div/div/div/div') - if content_divs: - content = content_divs[0].text.strip().replace('\n', ' ').replace(' ', ' ') - else: - content = None - if not content: - content_divs = ancestors[1].xpath('./div/div/div/div/div/div') - if content_divs: - content = extract_text(content_divs[0]) - pass - if title or content: - if any(x['url'] == url for x in results): - results_item = results.pop( - results.index( - [x for x in results if x['url'] == url][0])) - for key, var in (('title', title), ('content', content)): - if not results_item[key]: - results_item[key] = var - results.append(results_item) - else: - results.append({ - 'url': url, 'title': title, 'content': content}) - except Exception as err: - logger.error(err, exc_info=1) - # parse suggestion for suggestion in dom.xpath(suggestion_xpath): # append suggestion