Knot resolver: Diagnosing failed resolution

Am faced with a failed name resolution again. And wanting to hone my diagnostic skills.

Symptom is simple:

# nslookup firstdonoharm.dev
;; connection timed out; no servers could be reached

So:

# ubus call network.interface.wan status
{
	"up": true,
	"pending": false,
	"available": true,
	"autostart": true,
	"dynamic": false,
	"uptime": 133772,
	"l3_device": "pppoe-wan",
	"proto": "pppoe",
	"device": "eth1",
	"updated": [
		"addresses",
		"routes"
	],
	"metric": 0,
	"dns_metric": 0,
	"delegation": true,
	"ipv4-address": [
		{
			"address": "27.33.62.70",
			"mask": 32
		}
	],
	"ipv6-address": [
		{
			"address": "fe80::8941:d911:878e:cbfc",
			"mask": 128
		}
	],
	"ipv6-prefix": [
		
	],
	"ipv6-prefix-assignment": [
		
	],
	"route": [
		{
			"target": "0.0.0.0",
			"mask": 0,
			"nexthop": "10.20.20.173",
			"source": "0.0.0.0\/0"
		}
	],
	"dns-server": [
		"203.12.160.35",
		"203.12.160.36"
	],
	"dns-search": [
		
	],
	"inactive": {
		"ipv4-address": [
			
		],
		"ipv6-address": [
			
		],
		"route": [
			
		],
		"dns-server": [
			
		],
		"dns-search": [
			
		]
	},
	"data": {
		
	}
}

from which the relevant extract is:

"dns-server": ["203.12.160.35", "203.12.160.36"]

and so:

# nslookup firstdonoharm.dev 203.12.160.35
Server:		203.12.160.35
Address:	203.12.160.35#53

Name:      firstdonoharm.dev
Address 1: 75.2.60.5
Address 2: 2600:1f18:2489:8201:d278:9378:2114:f6e5
Address 3: 2604:a880:400:d0::182c:1001

What is going on here?

For reference:

# cat /etc/config/resolver

config resolver 'common'
	list interface '0.0.0.0'
	list interface '::0'
	option port '53'
	option keyfile '/etc/root.keys'
	option verbose '0'
	option msg_buffer_size '4096'
	option msg_cache_size '20M'
	option net_ipv6 '1'
	option net_ipv4 '1'
	option forward_upstream '1'
	option prefered_resolver 'kresd'
	option ignore_root_key '0'
	option prefetch 'yes'
	option static_domains '1'
	option dynamic_domains '1'

config resolver 'kresd'
	option rundir '/tmp/kresd'
	option log_stderr '1'
	option log_stdout '1'
	option forks '1'
	option keep_cache '0'
	list hostname_config '/etc/hosts'

config resolver 'unbound'
	option outgoing_range '60'
	option outgoing_num_tcp '1'
	option incoming_num_tcp '1'
	option msg_cache_slabs '1'
	option num_queries_per_thread '30'
	option rrset_cache_size '100K'
	option rrset_cache_slabs '1'
	option infra_cache_slabs '1'
	option infra_cache_numhosts '200'
	list access_control '0.0.0.0/0 allow'
	list access_control '::0/0 allow'
	option pidfile '/var/run/unbound.pid'
	option root_hints '/etc/unbound/named.cache'
	option target_fetch_policy '2 1 0 0 0'
	option harden_short_bufsize 'yes'
	option harden_large_queries 'yes'
	option qname_minimisation 'yes'
	option harden_below_nxdomain 'yes'
	option key_cache_size '100k'
	option key_cache_slabs '1'
	option neg_cache_size '10k'
	option prefetch_key 'yes'

config resolver 'unbound_remote_control'
	option control_enable 'yes'
	option control_use_cert 'no'
	list control_interface '127.0.0.1'

If I connect to kresd and do verbose(true) and try again and check /var/log/messages it’s hard to be sure because it’s noisy, lots of requests being logged from my LAN, but searching for firstdonoharm.dev I can find a block of possible interest:

2021-08-04 22:14:14 info kresd[5636]: [00000.00][plan] plan 'firstdonoharm.dev.' type 'A' uid [57935.00]
2021-08-04 22:14:14 info kresd[5636]: [57935.00][iter]   'firstdonoharm.dev.' type 'A' new uid was assigned .01, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [57935.01][cach]   => trying zone: ., NSEC, hash 0
2021-08-04 22:14:14 info kresd[5636]: [57935.01][cach]   => NSEC sname: range search miss (!covers)
2021-08-04 22:14:14 info kresd[5636]: [57935.01][cach]   => skipping zone: ., NSEC, hash 0;new TTL -123456789, ret -2
2021-08-04 22:14:14 info kresd[5636]: [57935.01][plan]   plan '.' type 'DNSKEY' uid [57935.02]
2021-08-04 22:14:14 info kresd[5636]: [57935.02][iter]     '.' type 'DNSKEY' new uid was assigned .03, parent uid .01
2021-08-04 22:14:14 info kresd[5636]: [57935.03][cach]     => satisfied by exact RRset: rank 060, new TTL 139850
2021-08-04 22:14:14 info kresd[5636]: [57935.03][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [57935.03][vldr]     <= parent: updating DNSKEY
2021-08-04 22:14:14 info kresd[5636]: [57935.03][vldr]     <= answer valid, OK
2021-08-04 22:14:14 info kresd[5636]: [57935.01][iter]   'firstdonoharm.dev.' type 'A' new uid was assigned .04, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [57935.04][plan]   plan 'dev.' type 'DS' uid [57935.05]
2021-08-04 22:14:14 info kresd[5636]: [57935.05][iter]     'dev.' type 'DS' new uid was assigned .06, parent uid .04
2021-08-04 22:14:14 info kresd[5636]: [57935.06][cach]     => satisfied by exact RRset: rank 060, new TTL 47461
2021-08-04 22:14:14 info kresd[5636]: [57935.06][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [57935.06][vldr]     <= DS: OK
2021-08-04 22:14:14 info kresd[5636]: [57935.06][vldr]     <= parent: updating DS
2021-08-04 22:14:14 info kresd[5636]: [57935.06][vldr]     <= answer valid, OK
2021-08-04 22:14:14 info kresd[5636]: [57935.04][iter]   'firstdonoharm.dev.' type 'A' new uid was assigned .07, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [57935.07][plan]   plan 'dev.' type 'DNSKEY' uid [57935.08]
2021-08-04 22:14:14 info kresd[5636]: [57935.08][iter]     'dev.' type 'DNSKEY' new uid was assigned .09, parent uid .07
2021-08-04 22:14:14 info kresd[5636]: [57935.09][cach]     => skipping exact packet: rank 025 (min. 030), new TTL -848
2021-08-04 22:14:14 info kresd[5636]: [57935.09][cach]     => trying zone: ., NSEC, hash 0
2021-08-04 22:14:14 info kresd[5636]: [57935.09][cach]     => NSEC sname: range search miss (!covers)
2021-08-04 22:14:14 info kresd[5636]: [57935.09][cach]     => skipping zone: ., NSEC, hash 0;new TTL -123456789, ret -2
2021-08-04 22:14:14 info kresd[5636]: [     ][nsre] score 21 for 203.12.160.35#00053;    cached RTT: 37
2021-08-04 22:14:14 info kresd[5636]: [     ][nsre] score 21 for 203.12.160.36#00053;    cached RTT: 1080
2021-08-04 22:14:14 info kresd[5636]: [57935.09][resl]     => id: '17610' querying: '203.12.160.35#00053' score: 21 zone cut: 'dev.' qname: 'dEV.' qtype: 'DNSKEY' proto: 'udp'
2021-08-04 22:14:14 info kresd[5636]: [00000.00][plan] plan 'firstdonoharm.dev.' type 'AAAA' uid [60575.00]
2021-08-04 22:14:14 info kresd[5636]: [60575.00][iter]   'firstdonoharm.dev.' type 'AAAA' new uid was assigned .01, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [60575.01][cach]   => trying zone: ., NSEC, hash 0
2021-08-04 22:14:14 info kresd[5636]: [60575.01][cach]   => NSEC sname: range search miss (!covers)
2021-08-04 22:14:14 info kresd[5636]: [60575.01][cach]   => skipping zone: ., NSEC, hash 0;new TTL -123456789, ret -2
2021-08-04 22:14:14 info kresd[5636]: [60575.01][plan]   plan '.' type 'DNSKEY' uid [60575.02]
2021-08-04 22:14:14 info kresd[5636]: [60575.02][iter]     '.' type 'DNSKEY' new uid was assigned .03, parent uid .01
2021-08-04 22:14:14 info kresd[5636]: [60575.03][cach]     => satisfied by exact RRset: rank 060, new TTL 139850
2021-08-04 22:14:14 info kresd[5636]: [60575.03][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [60575.03][vldr]     <= parent: updating DNSKEY
2021-08-04 22:14:14 info kresd[5636]: [60575.03][vldr]     <= answer valid, OK
2021-08-04 22:14:14 info kresd[5636]: [60575.01][iter]   'firstdonoharm.dev.' type 'AAAA' new uid was assigned .04, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [60575.04][plan]   plan 'dev.' type 'DS' uid [60575.05]
2021-08-04 22:14:14 info kresd[5636]: [60575.05][iter]     'dev.' type 'DS' new uid was assigned .06, parent uid .04
2021-08-04 22:14:14 info kresd[5636]: [60575.06][cach]     => satisfied by exact RRset: rank 060, new TTL 47461
2021-08-04 22:14:14 info kresd[5636]: [60575.06][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [60575.06][vldr]     <= DS: OK
2021-08-04 22:14:14 info kresd[5636]: [60575.06][vldr]     <= parent: updating DS
2021-08-04 22:14:14 info kresd[5636]: [60575.06][vldr]     <= answer valid, OK
2021-08-04 22:14:14 info kresd[5636]: [60575.04][iter]   'firstdonoharm.dev.' type 'AAAA' new uid was assigned .07, parent uid .00
2021-08-04 22:14:14 info kresd[5636]: [60575.07][plan]   plan 'dev.' type 'DNSKEY' uid [60575.08]
2021-08-04 22:14:14 info kresd[5636]: [60575.08][iter]     'dev.' type 'DNSKEY' new uid was assigned .09, parent uid .07
2021-08-04 22:14:14 info kresd[5636]: [60575.09][cach]     => skipping exact packet: rank 025 (min. 030), new TTL -848
2021-08-04 22:14:14 info kresd[5636]: [60575.09][cach]     => trying zone: ., NSEC, hash 0
2021-08-04 22:14:14 info kresd[5636]: [60575.09][cach]     => NSEC sname: range search miss (!covers)
2021-08-04 22:14:14 info kresd[5636]: [60575.09][cach]     => skipping zone: ., NSEC, hash 0;new TTL -123456789, ret -2
2021-08-04 22:14:14 info kresd[5636]: [     ][nsre] score 21 for 203.12.160.35#00053;    cached RTT: 37
2021-08-04 22:14:14 info kresd[5636]: [     ][nsre] score 21 for 203.12.160.36#00053;    cached RTT: 1080
2021-08-04 22:14:14 info kresd[5636]: [60575.09][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [60575.09][resl]     <= server: '203.12.160.35' rtt: 3 ms
2021-08-04 22:14:14 info kresd[5636]: [60575.09][resl]     => resuming yielded answer
2021-08-04 22:14:14 info kresd[5636]: [60575.09][vldr]     <= bad NODATA proof
2021-08-04 22:14:14 info kresd[5636]: [60575.09][cach]     => stashed packet: rank 025, TTL 5, DNSKEY dev. (44 B)
2021-08-04 22:14:14 info kresd[5636]: [60575.00][resl] request failed, answering with empty SERVFAIL
2021-08-04 22:14:14 info kresd[5636]: [60575.09][resl]     finished in state: 8, queries: 2, mempool: 65568 B
2021-08-04 22:14:14 info kresd[5636]: [57935.09][iter]     <= rcode: NOERROR
2021-08-04 22:14:14 info kresd[5636]: [57935.09][resl]     <= server: '203.12.160.35' rtt: 3 ms
2021-08-04 22:14:14 info kresd[5636]: [57935.09][resl]     => resuming yielded answer
2021-08-04 22:14:14 info kresd[5636]: [57935.09][vldr]     <= bad NODATA proof
2021-08-04 22:14:14 info kresd[5636]: [57935.09][cach]     => stashed packet: rank 025, TTL 5, DNSKEY dev. (44 B)
2021-08-04 22:14:14 info kresd[5636]: [57935.00][resl] request failed, answering with empty SERVFAIL

Which is hard to read alas. But looks like it’s getting responses from 203.12.160.35 juts fine, but for some cryptic reason concluding with SERVFAIL.

Part of me wonders if there is a way fo testing this in a more contain manner. /var/log/messages is simply very noisy. Kresd has a resolve() function but it’s not well documented if an how to use that via a CLI:

https://knot-resolver.readthedocs.io/en/stable/daemon-scripting.html#resolve

as it has a callback, and frankly it would be lovely to see an example of a simple “print the response” callback function and a away to define that if it’s not already defined so it’s available to specify as a function on the CLI.

Essentially if I could ask kresd to resolve(‘url’) and get a diagnostic trace of effort and result on screen that would be isolated from the many requests that land in /var/log/messages.

It would rock immensely if the output also were a little more lucid than what we see in /var/log/messages as that seems to me like it would only make sense to someone working on the kresd code base currently or able to consult it to see what those messages mean (in short they seem to appeal to arcane knowledge ;-).

I would love to understand why this domain is not resolving. It’s not a crisis, system is working, day to day, and I don’t need that site. But I noticed it was not resolving, so checked with downforme, and it was in fact just down for me only.

Alas, where I’m at is wanting to gain the skills to answer this question myself, me, so I started looking at why. Appreciate any upskilling you can offer me here. What to look at, and how to work out what is going on here.

1 Like

The server claims that dev. DNSKEY does not exist. Without a proof. (It’s wrong, because that record does exist.)

Is it this output that you’re interpreting as such. As I understand it the DNSKEY is part of DNSSEC (security aiming to ensure that the served name is authoritative and set by the domain’s owner and not some intermediary agent with malintent).

Your conclusion then if I understand it rightly is:

  1. 203.12.160.35 is not delivering a DNSKEY when requested
  2. The domain (.dev or firstdonoharm.dev?) does have a DNSKEY that should be delivered.

Or have I misunderstood?

Still curious to know if I can test a given domain like this using the resolve() CLI command in knot (with a simple callback to print results) rather than going via messages and turning all requests to verbose flooding messages fast.

In a nutshell though your suggesting the upstream server isn’t delivering DNSKEY. Yet:

# nslookup firstdonoharm.dev 203.12.160.35
Server:		203.12.160.35
Address:	203.12.160.35#53

Name:      firstdonoharm.dev
Address 1: 75.2.60.5
Address 2: 2600:1f18:2489:8201:d278:9378:2114:f6e5
Address 3: 2604:a880:400:d0::182c:1001

would imply then that nslookup doesn’t care, and isn’t checking for one, but kresd is and this relates to the DNSSEC I imagine. Yes?

This :arrow_up: is the query that doesn’t get a good answer.

The resolve() function doesn’t allow easy inspection of the result (without writing several lines of code). The result also isn’t immediately returned, as it’s computed asynchronously… by which time you have no good way of sending it back to the “person” who invoked the function (e.g. which CLI connection ran it).

It’s possible to log just some requests, e.g. just those that fail and just some subtrees, etc. (including logs in the failing request before the failure happened!) See docs around: Query policies — Knot Resolver 5.4.0 documentation

nslookup, (k)dig, etc. just ask a server (or operating system) and return what it returned. They don’t validate DNSSEC.

I’d be happy to write several lines of code ;-). So if you have an example that might coax it out of the CLI I’m all ears. I will of course look at the targeted logging, but a an interactive method, even a short script I could write in lua if need be, to run that does a single on demand test of a resolution dumping the debug trace of just that request, independent of the servers general traffic would have, to my mind, significant value!

1 Like

Hint, hint … would love to see those several lines of code :wink:

Well, I don’t have them at this moment… I don’t think we’ve ever done it. By “write” I didn’t mean copy&paste from somewhere. It seems like a useful idea, but right now there’s too much other stuff to do.

1 Like

That’s unfortunate. I’d love to know how to do it. A simple lua script that prints to console I guess. Not much familiar with lua.