This repository was archived by the owner on Sep 7, 2023. It is now read-only.

Comparing changes

base repository: searx/searx
base: master
head repository: privacore/searx
compare: master
Can’t automatically merge.
  • 7 commits
  • 5 files changed
  • 1 contributor

Commits on Sep 17, 2018

  1. Use a more human-friendly format in searx/data/engines_languages.json

    Instead of a single line with 500,000 characters, use nicely formatted JSON.
    Sort the lists in engine_languages.py so that, when updating, it is easier to
    see the differences (search engines do change the order in which their
    languages are listed).
    isj-privacore committed Sep 17, 2018
    9ab439f
  2. Use stronger language hints for yahoo

    Specifying the CGI parameter "&vl=lang_XX" only slightly skews the results toward that language. search.yahoo.com has a strong English bias, so for cross-language queries, e.g. "ronaldinho" or "currywurst", most results are still English even if you specify another language with vl=lang_XX. What works better is to use the dedicated subdomain for the country where that language is spoken and to also specify fr2=sb-top-XXX.
    
    14 languages and 30 countries where we can give stronger hints have been found manually, and the engine now sends these stronger hints to Yahoo.
    
    Language codes in both the simple iso-639-1 form (e.g. "fr") and the more complex iso-639-1 '-' iso-3166 form (e.g. "fr-BE") are supported by the engine.
    isj-privacore committed Sep 17, 2018
    aeb997a

Commits on Sep 18, 2018

  1. c068496

Commits on Oct 8, 2018

  1. Support per-query time limit

    The CGI parameter 'time_limit', if specified, overrides the time limit
    configured globally and per-engine. This is mostly useful for programs that
    use the JSON results and have more patience than interactive users
    (a usage sketch follows this commit list).
    isj-privacore committed Oct 8, 2018
    4629001
  2. c790ae4
  3. 18dbd07
  4. fadede7
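
As a rough illustration of how a JSON client could use the new per-query time limit (the instance URL and the /search?format=json endpoint are assumptions here; only the 'time_limit' parameter comes from this changeset):

    # Minimal sketch of a JSON client passing the new per-query time limit.
    # The instance URL and the /search?format=json endpoint are assumptions;
    # only 'time_limit' is introduced by these commits.
    import requests

    resp = requests.get('http://localhost:8888/search',
                        params={'q': 'currywurst',
                                'format': 'json',
                                'time_limit': '10.0'})  # seconds; overrides global/per-engine timeouts
    resp.raise_for_status()
    for result in resp.json().get('results', []):
        print(result.get('url'))
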
Showing with 27,367 additions and 14 deletions.
  1. +27,253 −1 searx/data/engines_languages.json
  2. +98 −10 searx/engines/yahoo.py
  3. +2 −1 searx/query.py
  4. +11 −1 searx/search.py
  5. +3 −1 utils/fetch_languages.py
27,254 changes: 27,253 additions & 1 deletion searx/data/engines_languages.json

Large diffs are not rendered by default.

108 changes: 98 additions & 10 deletions searx/engines/yahoo.py
@@ -23,11 +23,51 @@
time_range_support = True

# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
# Specifying the language with "&vl=lang_{lang}", e.g. "vl=lang_it", has some effect, but not much.
# Using dedicated subdomains is better, but doesn't work well for all languages (and sometimes
# they don't exist at all). Using e.g. "fr2=sb-top-it.search" combined with the subdomain has a good
# effect. Using all three seems to force the results into that language most of the time.
default_base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&fr2={fr2}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&fr2={fr2}&age={age}&btf={btf}&fr2=time'

supported_languages_url = 'https://search.yahoo.com/web/advanced'
# The (advanced) language page only lists the iso-639-1 codes, even though the form iso-639-1 '-' iso-3166 is supported.
# We use this extended list for languages that have regional variations that are supported by yahoo and where
# the <country_code>.search.yahoo.com DNS entry exists. These have been found manually:
extended_supported_languages = set([
'da-DK',
'de-AT',
'de-CH',
'de-DE',
'en-AU',
'en-CA',
'en-IE',
'en-NZ',
'en-UK',
'en-ZA',
'es-CO',
'es-ES',
'es-MX',
'fi-FI',
'fr-BE',
'fr-CA',
'fr-CH',
'fr-FR',
'hu-HU',
'it-IT',
'nb-NO',
'nn-NO',
'no-NO',
'nl-BE',
'nl-NL',
'pl-PL',
'pt-BR',
'pt-PT',
'ro-RO',
'sv-SE',
])
extended_supported_languages_lowercase = set([s.lower() for s in extended_supported_languages])

# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
@@ -43,6 +83,41 @@
language_aliases = {'zh-CN': 'zh-CHS', 'zh-TW': 'zh-CHT', 'zh-HK': 'zh-CHT'}


# Choose the best (base-url,sb-top-thing) for a given language
def base_url_and_parameter_for_language(lang):
    lang = lang.lower()
    if lang in extended_supported_languages_lowercase:
        # lang-country has extended support
        country = lang.split('-')[1]
        base_url = "https://" + country + ".search.yahoo.com/"
        fr2 = "sb-top-" + country + ".search"
        return (base_url, fr2)
    # is it for a 1:1 lang-country (e.g. fi-fi)?
    number_of_lang_matches = 0
    lang_match = None
    for lang_country in extended_supported_languages_lowercase:
        if lang.split('-')[0] == lang_country.split('-')[0]:
            number_of_lang_matches += 1
            lang_match = lang_country
    if number_of_lang_matches == 1:
        # found a single match on language, so it must be the case of lang='xx' and
        # extended_supported_languages_lowercase having element 'xx-yy'
        # assume there is a 1:1 between language and country, e.g. sv-se.
        country = lang_match.split('-')[1]
        base_url = "https://" + country + ".search.yahoo.com/"
        fr2 = "sb-top-" + country + ".search"
        return (base_url, fr2)
    elif number_of_lang_matches > 1:
        # The language is used in multiple countries and the user specified the language only (no country/region).
        # It is best not to use strong hints because otherwise we may get irrelevant results for the user.
        # E.g. if we get the query "billets d'avion pour Berlin" and lang=fr, then we don't want to focus it on
        # French-French results because the user could be Canadian, Belgian or Swiss, and all the fabulous offers
        # from CDG aren't that relevant.
        pass
    # no country specified in language code or unknown language so we can't pick a subdomain
    return (default_base_url, "sb-top-search")


# remove yahoo-specific tracking-url
def parse_url(url_string):
    endings = ['/RS', '/RK']
@@ -62,15 +137,18 @@ def parse_url(url_string):


def _get_url(query, offset, language, time_range):
    (base_url, fr2) = base_url_and_parameter_for_language(language)
    if time_range in time_range_dict:
        return base_url + search_url_with_time.format(offset=offset,
                                                      query=urlencode({'p': query}),
                                                      lang=language,
                                                      lang=language.replace('-', '_').lower(),
                                                      fr2=fr2,
                                                      age=time_range_dict[time_range][0],
                                                      btf=time_range_dict[time_range][1])
    return base_url + search_url.format(offset=offset,
                                        query=urlencode({'p': query}),
                                        lang=language)
                                        lang=language.replace('-', '_').lower(),
                                        fr2=fr2)


# do search-request
@@ -80,15 +158,12 @@ def request(query, params):

    offset = (params['pageno'] - 1) * 10 + 1
    language = match_language(params['language'], supported_languages, language_aliases)
    if language not in language_aliases.values():
        language = language.split('-')[0]
    language = language.replace('-', '_').lower()

    params['url'] = _get_url(query, offset, language, params['time_range'])

    # TODO required?
    params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
        .format(lang=language)
        .format(lang=language.replace('-', '_').lower())

    return params

@@ -144,8 +219,21 @@ def _fetch_supported_languages(resp):
        code_parts = option.xpath('./@value')[0][5:].split('_')
        if len(code_parts) == 2:
            code = code_parts[0] + '-' + code_parts[1].upper()
            supported_languages.append(code)
        else:
            code = code_parts[0]
        supported_languages.append(code)
            if code in extended_supported_languages:
                supported_languages.append(code)
            else:
                # The page lists only the language but not the language-regions supported.
                # E.g. only "fr" is listed but fr-CA, fr-FR, fr-BE, fr-CH are supported, so
                # override with prefix-matches from extended_supported_languages
                extended_langs_matches_found = 0
                for extended_lang in extended_supported_languages:
                    if extended_lang.split('-')[0] == code:
                        supported_languages.append(extended_lang)
                        extended_langs_matches_found += 1
                if extended_langs_matches_found == 0:
                    supported_languages.append(code)

    return supported_languages
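
For orientation, the expected behaviour of base_url_and_parameter_for_language, derived from the diff above (a sketch only; the module path searx.engines.yahoo is an assumption):

    # Illustrative sketch, not part of the diff; the import path is assumed.
    from searx.engines.yahoo import base_url_and_parameter_for_language

    # Full language-region code with a dedicated subdomain: strong hints.
    assert base_url_and_parameter_for_language('fr-BE') == \
        ('https://be.search.yahoo.com/', 'sb-top-be.search')

    # Bare language spoken in exactly one listed country (fi -> fi-FI): still strong hints.
    assert base_url_and_parameter_for_language('fi') == \
        ('https://fi.search.yahoo.com/', 'sb-top-fi.search')

    # Bare language used in several listed countries (fr-BE, fr-CA, fr-CH, fr-FR): fall back to the default.
    assert base_url_and_parameter_for_language('fr') == \
        ('https://search.yahoo.com/', 'sb-top-search')
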
3 changes: 2 additions & 1 deletion searx/query.py
@@ -163,14 +163,15 @@ def getFullQuery(self):
class SearchQuery(object):
"""container for all the search parameters (query, language, etc...)"""

def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range):
def __init__(self, query, engines, categories, lang, safesearch, pageno, time_range, time_limit):
self.query = query.encode('utf-8')
self.engines = engines
self.categories = categories
self.lang = lang
self.safesearch = safesearch
self.pageno = pageno
self.time_range = time_range
self.time_limit = time_limit

def __str__(self):
return str(self.query) + ";" + str(self.engines)
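
A hypothetical call site for the extended constructor (all values below are placeholders, not taken from searx; only the position of time_limit as the new last argument reflects the diff):

    # Hypothetical example; values are placeholders.
    selected_engines = []          # stand-in for whatever the caller collected
    sq = SearchQuery(query=u'currywurst',
                     engines=selected_engines,
                     categories=['general'],
                     lang='de-DE',
                     safesearch=0,
                     pageno=1,
                     time_range=None,
                     time_limit=5.0)   # per-query limit in seconds; None keeps the configured timeouts
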
12 changes: 11 additions & 1 deletion searx/search.py
@@ -250,6 +250,12 @@ def get_search_query_from_webapp(preferences, form):
    if query_time_range not in ('None', None, '', 'day', 'week', 'month', 'year'):
        raise SearxParameterException('time_range', query_time_range)

    if 'time_limit' in form:
        time_limit_param = form.get('time_limit')
        time_limit = float(time_limit_param)
    else:
        time_limit = None

    # query_engines
    query_engines = raw_text_query.engines

@@ -324,7 +330,7 @@ def get_search_query_from_webapp(preferences, form):
if (engine.name, categ) not in disabled_engines)

    return SearchQuery(query, query_engines, query_categories,
                       query_lang, query_safesearch, query_pageno, query_time_range)
                       query_lang, query_safesearch, query_pageno, query_time_range, time_limit)


class Search(object):
@@ -408,6 +414,10 @@ def search(self):
            # update timeout_limit
            timeout_limit = max(timeout_limit, engine.timeout)

        # if 'time_limit' was specified in the query then use that
        if search_query.time_limit is not None:
            timeout_limit = search_query.time_limit

        if requests:
            # send all search-request
            search_multiple_requests(requests, self.result_container, start_time, timeout_limit)
4 changes: 3 additions & 1 deletion utils/fetch_languages.py
@@ -27,12 +27,14 @@ def fetch_supported_languages():
        if hasattr(engines[engine_name], 'fetch_supported_languages'):
            try:
                engines_languages[engine_name] = engines[engine_name].fetch_supported_languages()
                if type(engines_languages[engine_name]) == list:
                    engines_languages[engine_name] = sorted(engines_languages[engine_name])
            except Exception as e:
                print(e)

    # write json file
    with io.open(engines_languages_file, "w", encoding="utf-8") as f:
        dump(engines_languages, f, ensure_ascii=False)
        dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    return engines_languages
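
As a standalone illustration of what the new dump settings produce (the sample dict below is made up; only the json.dump arguments and the sorting mirror the change above):

    # Standalone sketch, not part of the diff; the sample data is invented.
    import io
    from json import dump

    engines_languages = {'yahoo': sorted(['fr-BE', 'da-DK', 'de-AT'])}

    with io.open('/tmp/engines_languages.json', 'w', encoding='utf-8') as f:
        dump(engines_languages, f, ensure_ascii=False, indent=4, separators=(',', ': '))

    # The file now holds one entry per line instead of a single huge line, and the
    # language list is sorted, so future updates produce readable diffs:
    # {
    #     "yahoo": [
    #         "da-DK",
    #         "de-AT",
    #         "fr-BE"
    #     ]
    # }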